Analysis and data associated with the manuscript: "Regenerative lineages and immune-mediated pruning in lung cancer metastasis", Laughney et al. (Nature Medicine 2019)
We used 10X droplet-based single-cell RNA sequencing (scRNA-seq) and graph-based phenotypic analysis to explore tumour cell heterogeneity through the lens of epithelial development and regeneration in lung adenocarcinoma primary tumors and metastases, to assess parallels between tumor cell plasticity and developmental hierarchies. Using patient primary and metastatic tumors, as well as a mouse model of lung cancer metastasis, our analyses provide evidence of regenerative cell types and lineage promiscuity in untreated primary tumours and phenotypic states spanning key stages of embryonic lung morphogenesis in metastases. This notebook walks the reader through key analyses and illustrates how to query annotated pre- and post-processed data.
For each data subset, we provide:
DFsubset = count matrix (cells x genes)
NDFsubset = library-size normalized count matrix (cells x genes)
INDFsubset = imputed and library-size normalized dataframe (cells x genes)
DIMENSIONSsubset = cells x dimensions (i.e. principal components, diffusion components)
GENE_RANK_MASTsubset = genes x cell group (DEG in cell group compared to all other cells)
fig_dpi = 300
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
import time
import os
import numpy as np
import seaborn as sns
import pandas as pd
import re
import h5py
from copy import deepcopy
from colors import rgb, hex
import scipy.cluster.hierarchy as hc
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches
import matplotlib.colors as colors
import matplotlib.font_manager
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Comic Sans MS'
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, fcluster, leaves_list, set_link_color_palette
import seqc
import seqc.plot
import seqc.stats
from pylab import *
from scipy.stats import zscore
import cmocean
from scipy.stats import linregress
def gauss(x,mu,sigma,A):
return A*exp(-(x-mu)**2/2/sigma**2)
def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)
def rgb2hex(r,g,b):
hex = "#{:02x}{:02x}{:02x}".format(r,g,b)
return hex
def hex2rgb(hexcode):
rgb = tuple(map(ord,hexcode[1:].decode('hex')))
return rgb
import scipy.cluster.hierarchy as sch
import re
_nsre = re.compile('([0-9]+)')
def natural_sort_key(s):
return [int(text) if text.isdigit() else text.lower()
for text in re.split(_nsre, s)]
%matplotlib inline
# H5 WRAPPER
class H5:
def __init__(self, archive_name: str):
"""Wrapper for the pandas HDFStore class which ensures that all interactions with
the archive result in a closed, flushed archive.
In order to ensure data usability, all data must be submitted in DataFrame format.
This decision was made to encourage users to pair metadata with sequencing data,
and reduce the incidence of unexpected data permutation.
:param archive_name: name of the h5 archive to open. If the archive does not exist
it will be created using a blosc5 filter
:method ls: list contents of the archive
:method save: save an object to the h5 archive
:method load: load an object from the archive
:method remove: remove a DataFrame from the archive
:method is_open: returns True if the h5 archive is open, else False
"""
if os.path.isfile(archive_name):
self._archive = pd.HDFStore(archive_name, mode='a')
self._archive.close()
else:
self._archive = pd.HDFStore(
archive_name, mode='a', complib='blosc', complevel=5)
self._archive.close()
def __repr__(self):
self._archive.open()
try:
return repr(self._archive)
finally:
self._archive.close()
def save(self, data: pd.DataFrame, location: str) -> None:
"""Save DataFrame data to the h5 archive in location.
:param data: DataFrame object to store
:param location: filepath to save the object in the h5 hierarchy
"""
if not isinstance(data, pd.DataFrame):
if isinstance(data, np.ndarray):
res = input('np.ndarray class detected. Save as pd.DataFrame with '
'ascending integer indices? [y/n] ')
if res in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']:
data = pd.DataFrame(data)
else:
print('User elected not to save DataFrame, archive is unmodified.')
return
else:
raise TypeError('only pd.DataFrame objects can be saved using this '
'class. To save np.ndarray objects please see the tables '
'package.')
self._archive.open()
try:
self._archive[location] = data
finally:
self._archive.close()
def load(self, location: str) -> None:
"""Load and return the dataframe found at location in the archive.
:param location: str, location of object to retrieve from h5
:return: pd.DataFrame, object found at location
"""
self._archive.open()
try:
return self._archive[location]
finally:
self._archive.close()
def ls(self) -> None:
"""list archive contents"""
try:
self._archive.open()
print(self._archive.keys())
finally:
self._archive.close()
def remove(self, location: str) -> None:
"""remove the DataFrame at location from the archive
Note: removing a dataframe at a branch node will remove all leaves sharing this
prefix. e.g. in an archive containing:
/data
/data/filtered
/data/metadata
/new_data/data
removing /data would remove the first three DataFrame objects from the archive.
:param location: location of DataFrame to remove
:return: None
"""
self._archive.open()
try:
if location not in self._archive.keys():
raise ValueError(
'{} not contained in archive, nothing to remove.'.format(location))
else:
removed = [k for k in self._archive.keys()
if k.startswith(location + '/')]
if len(removed) != 0:
res = input(
'Removing branch node {}, which is a prefix for {!a} will remove '
'all listed DataFrames. Continue with removal? [y/n] '.format(
location, removed))
if res not in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']:
print('returned without deletion.')
return
self._archive.remove(location)
finally:
self._archive.close()
@property
def is_open(self) -> bool:
return self._archive.is_open
# FUNCTIONS FOR DOT PLOT
def _plot_gene_groups_brackets(gene_groups_ax, group_positions, group_labels,
left_adjustment=-0.3, right_adjustment=0.3, rotation=None, orientation='top'):
"""
Draws brackets that represent groups of genes on the give axis.
For best results, this axis is located on top of an image whose
x axis contains gene names.
The gene_groups_ax should share the x axis with the main ax.
Eg: gene_groups_ax = fig.add_subplot(axs[0, 0], sharex=dot_ax)
This function is used by dotplot, heatmap etc.
Parameters
----------
gene_groups_ax : matplotlib axis
In this axis the gene marks are drawn
group_positions : list of `tuples`
Each item in the list, should contain the start and end position that the
bracket should cover.
Eg. [(0, 4), (5, 8)] means that there are two brackets, one for the var_names (eg genes)
in positions 0-4 and other for positions 5-8
group_labels : list
List of group labels
left_adjustment : `float`
adjustment to plot the bracket start slightly before or after the first gene position.
If the value is negative the start is moved before.
right_adjustment : `float`
adjustment to plot the bracket end slightly before or after the last gene position
If the value is negative the start is moved before.
rotation : `float` (default None)
rotation degrees for the labels. If not given, small labels (<4 characters) are not
rotated, otherwise, they are rotated 90 degrees
orientation : `str` (default `top`)
location of the brackets. Either `top` or `right`
Returns
-------
None
"""
import matplotlib.patches as patches
from matplotlib.path import Path
# get the 'brackets' coordinates as lists of start and end positions
left = [x[0] + left_adjustment for x in group_positions]
right = [x[1] + right_adjustment for x in group_positions]
# verts and codes are used by PathPatch to make the brackets
verts = []
codes = []
if orientation == 'top':
# rotate labels if any of them is longer than 4 characters
if rotation is None and group_labels is not None and len(group_labels) > 0:
if max([len(x) for x in group_labels]) > 4:
rotation = 90
else:
rotation = 0
for idx in range(len(left)):
verts.append((left[idx], 0)) # lower-left
verts.append((left[idx], 0.6)) # upper-left
verts.append((right[idx], 0.6)) # upper-right
verts.append((right[idx], 0)) # lower-right
codes.append(Path.MOVETO)
codes.append(Path.LINETO)
codes.append(Path.LINETO)
codes.append(Path.LINETO)
try:
group_x_center = left[idx] + float(right[idx] - left[idx]) / 2
gene_groups_ax.text(group_x_center, 1.1, group_labels[idx], ha='center',
va='bottom', rotation=rotation)
except:
pass
else:
top = left
bottom = right
for idx in range(len(top)):
verts.append((0, top[idx])) # upper-left
verts.append((0.15, top[idx])) # upper-right
verts.append((0.15, bottom[idx])) # lower-right
verts.append((0, bottom[idx])) # lower-left
codes.append(Path.MOVETO)
codes.append(Path.LINETO)
codes.append(Path.LINETO)
codes.append(Path.LINETO)
try:
diff = bottom[idx] - top[idx]
group_y_center = top[idx] + float(diff) / 2
if diff * 2 < len(group_labels[idx]):
# cut label to fit available space
group_labels[idx] = group_labels[idx][:int(diff * 2)] + "."
gene_groups_ax.text(0.6, group_y_center, group_labels[idx], ha='right',
va='center', rotation=270, fontsize='small')
except Exception as e:
print('problems {}'.format(e))
pass
path = Path(verts, codes)
patch = patches.PathPatch(path, facecolor='None', lw=1.5,edgecolor='k')
gene_groups_ax.add_patch(patch)
gene_groups_ax.spines['right'].set_visible(False)
gene_groups_ax.spines['top'].set_visible(False)
gene_groups_ax.spines['left'].set_visible(False)
gene_groups_ax.spines['bottom'].set_visible(False)
gene_groups_ax.grid(False)
# remove y ticks
gene_groups_ax.tick_params(axis='y', left=False, labelleft=False)
# remove x ticks and labels
gene_groups_ax.tick_params(axis='x', bottom=False, labelbottom=False, labeltop=False)
#gene_groups_ax.tick_params(direction='out', length=6, width=2, colors='r',
# grid_color='r', grid_alpha=0.5)
def dotplot(adata, var_names, categories, groupby=None, use_raw=None, log=False, num_categories=7,
expression_cutoff=0., mean_only_expressed=False, color_map='Reds', dot_max=None,
dot_min=None, figsize=None, dendrogram=False, gene_symbols=None,
var_group_positions=None, standard_scale=None, smallest_dot=0.,
var_group_labels=None, var_group_rotation=None, layer=None, show=None,
save=None, **kwds):
"""
Makes a *dot plot* of the expression values of `var_names`.
For each var_name and each `groupby` category a dot is plotted. Each dot
represents two values: mean expression within each category (visualized by
color) and fraction of cells expressing the var_name in the
category (visualized by the size of the dot). If groupby is not given, the
dotplot assumes that all data belongs to a single category.
**Note**: A gene is considered expressed if the expression value in the adata
(or adata.raw) is above the specified threshold which is zero by default.
An example of dotplot usage is to visualize, for multiple marker genes,
the mean value and the percentage of cells expressing the gene accross multiple clusters.
Parameters
----------
{common_plot_args}
expression_cutoff : `float` (default: `0.`)
Expression cutoff that is used for binarizing the gene expression and determining the fraction
of cells expressing given genes. A gene is expressed only if the expression value is greater than
this threshold.
mean_only_expressed : `bool` (default: `False`)
If True, gene expression is averaged only over the cells expressing the given genes.
color_map : `str`, optional (default: `Reds`)
String denoting matplotlib color map.
dot_max : `float` optional (default: `None`)
If none, the maximum dot size is set to the maximum fraction value found (e.g. 0.6). If given,
the value should be a number between 0 and 1. All fractions larger than dot_max are clipped to
this value.
dot_min : `float` optional (default: `None`)
If none, the minimum dot size is set to 0. If given,
the value should be a number between 0 and 1. All fractions smaller than dot_min are clipped to
this value.
standard_scale : {{'var', 'group'}}, optional (default: None)
Whether or not to standardize that dimension between 0 and 1, meaning for each variable or group,
subtract the minimum and divide each by its maximum.
smallest_dot : `float` optional (default: 0.)
If none, the smallest dot has size 0. All expression levels with `dot_min` are potted with
`smallest_dot` dot size.
{show_save_ax}
**kwds : keyword arguments
Are passed to `matplotlib.pyplot.scatter`.
Returns
-------
List of :class:`~matplotlib.axes.Axes`
Examples
-------
>>> adata = sc.datasets.pbmc68k_reduced()
>>> sc.pl.dotplot(adata, ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ'],
... groupby='bulk_labels', dendrogram=True)
"""
if isinstance(var_names, str):
var_names = [var_names]
#categories, obs_tidy = _prepare_dataframe(adata, var_names, groupby, use_raw, log, num_categories,
# layer=layer, gene_symbols=gene_symbols)
obs_tidy = adata
# for if category defined by groupby (if any) compute for each var_name
# 1. the fraction of cells in the category having a value > expression_cutoff
# 2. the mean value over the category
# 1. compute fraction of cells having value > expression_cutoff
# transform obs_tidy into boolean matrix using the expression_cutoff
obs_bool = obs_tidy > expression_cutoff
# compute the sum per group which in the boolean matrix this is the number
# of values > expression_cutoff, and divide the result by the total number of values
# in the group (given by `count()`)
fraction_obs = obs_bool.groupby(level=groupby,axis=0).sum() / obs_bool.groupby(level=groupby,axis=0).count()
fraction_obs = fraction_obs.ix[categories,:] #re-order by categories
# 2. compute mean value
if mean_only_expressed:
mean_obs = obs_tidy.mask(~obs_bool).groupby(level=groupby,axis=0).mean().fillna(0)
mean_obs = mean_obs.ix[categories,:] #re-order by categories
else:
mean_obs = obs_tidy.groupby(level=groupby,axis=0).mean()
if standard_scale == 'group':
mean_obs = mean_obs.sub(mean_obs.min(1), axis=0)
mean_obs = mean_obs.div(mean_obs.max(1), axis=0).fillna(0)
elif standard_scale == 'var':
mean_obs -= mean_obs.min(0)
mean_obs = (mean_obs / mean_obs.max(0)).fillna(0)
elif standard_scale is None:
pass
else:
logg.warn('Unknown type for standard_scale, ignored')
dendro_width = 0.8 if dendrogram else 0
colorbar_width = 0.2
colorbar_width_spacer = 0.5
size_legend_width = 0.25
if figsize is None:
height = len(categories) * 0.3 + 1 # +1 for labels
# if the number of categories is small (eg 1 or 2) use
# a larger height
height = max([1.5, height])
heatmap_width = len(var_names) * 0.35
width = heatmap_width + colorbar_width + size_legend_width + dendro_width + colorbar_width_spacer
else:
width, height = figsize
heatmap_width = width - (colorbar_width + size_legend_width + dendro_width + colorbar_width_spacer)
# colorbar ax width should not change with differences in the width of the image
# otherwise can become too small
if var_group_positions is not None and len(var_group_positions) > 0:
# add some space in case 'brackets' want to be plotted on top of the image
height_ratios = [0.5, 10]
else:
height_ratios = [0, 10.5]
# define a layout of 2 rows x 5 columns
# first row is for 'brackets' (if no brackets needed, the height of this row is zero)
# second row is for main content. This second row
# is divided into 4 axes:
# first ax is for the main figure
# second ax is for dendrogram (if present)
# third ax is for the color bar legend
# fourth ax is for an spacer that avoids the ticks
# from the color bar to be hidden beneath the size lengend axis
# fifth ax is to plot the size legend
fig = plt.figure(figsize=(width, height))
axs = gridspec.GridSpec(nrows=2, ncols=5, wspace=0.02, hspace=0.04,
width_ratios=[heatmap_width, dendro_width, colorbar_width, colorbar_width_spacer, size_legend_width],
height_ratios=height_ratios)
if len(categories) < 4:
# when few categories are shown, the colorbar and size legend
# need to be larger than the main plot, otherwise they would look
# compressed. For this, the dotplot ax is split into two:
axs2 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=axs[1, 0],
height_ratios=[len(categories) * 0.3, 1])
dot_ax = fig.add_subplot(axs2[0])
else:
dot_ax = fig.add_subplot(axs[1, 0])
color_legend = fig.add_subplot(axs[1, 2])
# to keep the size_legen of about the same height, irrespective
# of the number of categories, the fourth ax is subdivided into two parts
size_legend_height = min(1.3, height)
# wspace is proportional to the width but a constant value is
# needed such that the spacing is the same for thinner or wider images.
wspace = 10.5 / width
axs3 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=axs[1, 4], wspace=wspace,
height_ratios=[size_legend_height / height,
(height - size_legend_height) / height])
# make scatter plot in which
# x = var_names
# y = groupby category
# size = fraction
# color = mean expression
y, x = np.indices(mean_obs.shape)
y = y.flatten()
x = x.flatten()
frac = fraction_obs.values.flatten()
mean_flat = mean_obs.values.flatten()
cmap = plt.get_cmap(color_map)
if dot_max is None:
dot_max = np.ceil(max(frac) * 10) / 10
else:
if dot_max < 0 or dot_max > 1:
raise ValueError("`dot_max` value has to be between 0 and 1")
if dot_min is None:
dot_min = 0
else:
if dot_min < 0 or dot_min > 1:
raise ValueError("`dot_min` value has to be between 0 and 1")
if dot_min != 0 or dot_max != 1:
# clip frac between dot_min and dot_max
frac = np.clip(frac, dot_min, dot_max)
old_range = dot_max - dot_min
# re-scale frac between 0 and 1
frac = ((frac - dot_min) / old_range)
size = (frac * 10) ** 2
size += smallest_dot
import matplotlib.colors
normalize = matplotlib.colors.Normalize(vmin=kwds.get('vmin'), vmax=kwds.get('vmax'))
colors = cmap(normalize(mean_flat))
dot_ax.scatter(x, y, color=colors, s=size, cmap=cmap, norm=None, edgecolor='none', **kwds)
y_ticks = range(mean_obs.shape[0])
dot_ax.set_yticks(y_ticks)
dot_ax.set_yticklabels([mean_obs.index[idx] for idx in y_ticks])
x_ticks = range(mean_obs.shape[1])
dot_ax.set_xticks(x_ticks)
dot_ax.set_xticklabels([mean_obs.columns[idx] for idx in x_ticks], rotation=90)
dot_ax.tick_params(axis='both', labelsize='small')
dot_ax.grid(False)
dot_ax.set_xlim(-0.5, len(var_names) + 0.5)
dot_ax.set_ylabel(groupby)
# to be consistent with the heatmap plot, is better to
# invert the order of the y-axis, such that the first group is on
# top
ymin, ymax = dot_ax.get_ylim()
dot_ax.set_ylim(ymax+0.5, ymin - 0.5)
dot_ax.set_xlim(-1, len(var_names))
# plot group legends on top of dot_ax (if given)
if var_group_positions is not None and len(var_group_positions) > 0:
gene_groups_ax = fig.add_subplot(axs[0, 0], sharex=dot_ax)
_plot_gene_groups_brackets(gene_groups_ax, group_positions=var_group_positions,
group_labels=var_group_labels,
rotation=var_group_rotation,
orientation = 'top')
# plot colorbar
import matplotlib.colorbar
matplotlib.colorbar.ColorbarBase(color_legend, cmap=cmap, norm=normalize)
# for the dot size legend, use step between dot_max and dot_min
# based on how different they are.
diff = dot_max - dot_min
if 0.3 < diff <= 0.6:
step = 0.1
elif diff <= 0.3:
step = 0.05
else:
step = 0.2
# a descending range that is afterwards inverted is used
# to guarantee that dot_max is in the legend.
fracs_legends = np.arange(dot_max, dot_min, step * -1)[::-1]
if dot_min != 0 or dot_max != 1:
fracs_values = ((fracs_legends - dot_min) / old_range)
else:
fracs_values = fracs_legends
size = (fracs_values * 10) ** 2
size += smallest_dot
color = [cmap(normalize(value)) for value in np.repeat(max(mean_flat) * 0.7, len(size))]
# plot size bar
size_legend = fig.add_subplot(axs3[0])
size_legend.scatter(np.repeat(0, len(size)), range(len(size)), s=size, color=color)
size_legend.set_yticks(range(len(size)))
labels = ["{:.0%}".format(x) for x in fracs_legends]
if dot_max < 1:
labels[-1] = ">" + labels[-1]
size_legend.set_yticklabels(labels)
size_legend.set_yticklabels(["{:.0%}".format(x) for x in fracs_legends])
size_legend.tick_params(axis='y', left=False, labelleft=False, labelright=True)
# remove x ticks and labels
size_legend.tick_params(axis='x', bottom=False, labelbottom=False)
# remove surrounding lines
size_legend.spines['right'].set_visible(False)
size_legend.spines['top'].set_visible(False)
size_legend.spines['left'].set_visible(False)
size_legend.spines['bottom'].set_visible(False)
size_legend.grid(False)
ymin, ymax = size_legend.get_ylim()
size_legend.set_ylim(ymin, ymax+0.5)
#utils.savefig_or_show('dotplot', show=show, save=save)
return axs
# PATH TO H5 WITH PROCESSED DATAFRAMES
DATA_PATH = './data/'#'/ifs/e63data/massaguelab/ashley/data/sc_RNAseq/h5/processed/Project_AL_H2087_LCC/manuscript/'
FN = 'PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED.h5'
h5_data = H5(DATA_PATH + FN)
h5_data.ls()
# SPECIFY SAVE DIRECTORIES
tag = 'demo' # optional key word to append to directory name
now = time.strftime("%x")
now = str.replace(now,'/','_')
# SPECIFY OUTPUT STEMS FOR FIGURES/PATHWAY ANALYSIS
FIG_output_stem = "./figures/" + FN.replace('.h5','{}_'.format(tag)) + now +'/'
print(FIG_output_stem)
# CREATE DIRECTORIES IF THEY DO NOT EXIST
# CREATE FIGURE DIRECTORY IF IT DOES NOT EXIST
d = os.path.dirname(FIG_output_stem)
if not os.path.exists(d):
os.makedirs(d)
# --- DEFINE ALL COLORMAPS UPFRONT FOR UNIFORMITY ---
# CATEGORICAL: SAMPLE
FLATUI_SAMPLE = [
'800000', #'RU255B_BRAINMET_4S_TR' --> BRAIN MET (TR) FF4040 REDS
'551A8B', #'RU653_TUMOR_1AS_UTR' --> PRIMARY 551A8B purple
'0000EE', #'RU661_TUMOR_1AS_UTR' -- PRIMARY 0000EE blue
'FF00FF', #'RU666_SPINEMET_4NS_TR' --> SPINE MET (TR) 8B0000 REDS
'EEEE00', #'RU675_NORMAL_1ANS_UTR' --> NORMAL EEEE00 YELLOWS
'4682B4', #'RU675_TUMOR_1ANS_UTR' --> PRIMARY 4682B4 steel blue
'63B8FF', #'RU676_TUMOR_1AS_UTR' --> PRIMARY 63B8FF steel blue 1
'9400D3', #'RU679_TUMOR_2AS_TR' --> PRIMARY (TR) 9400D3 DARK VIOLET
'33A1C9', #'RU680_TUMOR_1AS_UTR' --> PRIMARY 33A1C9 peacock
'DC143C', #'RU681_BRAINMET_4S_TR' --> BRAIN MET (TR) CD0000 REDS
'CDCD00', #'RU682_NORMAL_2AS_UTR' --> NORMAL CDCD00 YELLOWS
'00C5CD', #'RU682_TUMOR_2AS_UTR' --> PRIMARY 00C5CD turquois
'8B8B00', #'RU684_NORMAL_1AS_UTR'--> NORMAL 8B8B00 YELLOWS
'03A89E', #'RU684_TUMOR_1AS_UTR' --> PRIMARY 03A89E maganese blue
'808069', #'RU685_NORMAL_1ANS_UTR' --> NORMAL 808069 YELLOWS
'FF1493', # RU699_ADRENAL_MET
'FF4500', #RU701_BRAINMET
]
# CATEGORICAL: NORMAL SAMPLES
FLATUI_NORMAL = [
'EEEE00', #'RU675_NORMAL_1ANS_UTR' --> NORMAL EEEE00 YELLOWS
'CDCD00', #'RU682_NORMAL_2AS_UTR' --> NORMAL CDCD00 YELLOWS
'8B8B00', #'RU684_NORMAL_1AS_UTR'--> NORMAL 8B8B00 YELLOWS
'808069', #'RU685_NORMAL_1ANS_UTR' --> NORMAL 808069 YELLOWS
]
FLATUI_CANCERandNORMAL = [
'551A8B', #'RU653_TUMOR_1AS_UTR' --> PRIMARY 551A8B purple
'0000EE', #'RU661_TUMOR_1AS_UTR' -- PRIMARY 0000EE blue
'EEEE00', #'RU675_NORMAL_1ANS_UTR' --> NORMAL EEEE00 YELLOWS
'4682B4', #'RU675_TUMOR_1ANS_UTR' --> PRIMARY 4682B4 steel blue
'63B8FF', #'RU676_TUMOR_1AS_UTR' --> PRIMARY 63B8FF steel blue 1
'9400D3', #'RU679_TUMOR_2AS_TR' --> PRIMARY (TR) 9400D3 DARK VIOLET
'33A1C9', #'RU680_TUMOR_1AS_UTR' --> PRIMARY 33A1C9 peacockEDS
'CDCD00', #'RU682_NORMAL_2AS_UTR' --> NORMAL CDCD00 YELLOWS
'00C5CD', #'RU682_TUMOR_2AS_UTR' --> PRIMARY 00C5CD turquois
'8B8B00', #'RU684_NORMAL_1AS_UTR'--> NORMAL 8B8B00 YELLOWS
'03A89E', #'RU684_TUMOR_1AS_UTR' --> PRIMARY 03A89E maganese blue
'808069', #'RU685_NORMAL_1ANS_UTR' --> NORMAL 808069 YELLOWS
]
FLATUI_SOURCE = [
'FF0000', # MET
'E6E6FA', # NORMAL
'000000', # PRIMARY
]
FLATUI_PATIENT = [
'FF4040', # 255B
'551A8B', # 653
'0000EE', # 661
'8B0000', # 666
'4682B4', # 675
'63B8FF', # 676
'CD00CD', # 679
'9400D3', # 680
'33A1C9', # 681
'CD0000', # 682
'00C5CD', # 684
'03A89E', # 685
'808069', # 699
'99FFFF', # 701
]
FLATUI_CLASS = [
'0000FF', # 0
'FF0000', # 1
'FF8C00', # 2
'FFD700', # 3
'808000', # 4
'9ACD32', # 5
'44F544', # 6
'20B2AA', # 7
'2F4F4F', # 8
'00BFFF', # 9
'147ADF', # 10
'191970', # 11
'6200A4', # 12
'8B008B', # 13
'FF00FF', # 14
'FF1493', # 15
'8B4513', # 16
'D2691E', # 17
'B0C4DE', # 18
'696969', # 19
'FF586E', # 20
'FAEBD7', # 21
'E2C792', # 22
'FFF7A4', # 23
'83715A', # 24
'BC8F8F', # 25
'4C718E', # 26
'519395', # 27
'7FFFD4', # 28
'0F830F', # 29
'D4FFF0', # 30
'B72222', # 31
'FDBABA', # 32
'1B1010', # 33
'CD57FF', # 34
'F0FFFF', # 35
]
# UPDATE COLORMAP
FLATUI_CELL_TYPE = [
'014E4E', # Breg
'800080', # DENDRITIC
'490049', # DENDRITIC (ACTIVATED)
'0000FF', # ENDOTHELIAL
'BC8F8F', # EPITHELIAL
'00994C', # FIBROBLAST
'000000', # IG
'000080', # MACROPHAGE
'B0C4DE', # MAST
'FF00FF', # MDSC
'0080FF', # MICROGLIA/MACROPHAGE
'D8BFD8', # MONOCYTE
'983DFF', # MONOCYTE (ACTIVATED)
'FF0000', # NK
'800000', # NKT
'00CCCC', # PERICYTE
'00FF7F', # PROLIFERATING MESENCHYMAL PROGENITOR
'FFD700', # Th
'994C00', # Tm
'FF8000', # Treg
]
# METACELL TYPE TO HEX
FLATUI_METACELL = [
'00008B', # LYMPHOID
'8B008B', # MYELOID
'BC8F8F', # OTHER
]
FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS = [
'0000FF', # 0
'FF0000', # 1
'FF8C00', # 2
'FFD700', # 3
'808000', # 4
'9ACD32', # 5
'44F544', # 6
'20B2AA', # 7
'2F4F4F', # 8
'00BFFF', # 9
'147ADF', # 10
'191970', # 11
'6200A4', # 12
'8B008B', # 13
'FF00FF', # 14
'FF1493', # 15
'8B4513', # 16
'B0C4DE', # 17
'606060', # 18
'696969', # 19
'FF586E', # 20
'FAEBD7', # 21
'E2C792', # 22
'FFF7A4', # 23
'83715A', # 24
'BC8F8F', # 25
'4C718E', # 26
'519395', # 27
'7FFFD4', # 28
'0F830F', # 29
'D4FFF0', # 30
'B72222', # 31
'FDBABA', # 32
'1B1010', # 33
'CD57FF', # 34
'CCFFFF', # 34 **
'CD853F', # 36
'FF7F50', # 37
'C692FF', #38
'433355', # 39
]
FLATUI_CELL_TYPE_MYELOID_OTHER_ALL = [
'800080', # DENDRITIC (purple)
'490049', # DENDRITIC ACTIVATED (purple)
'0000FF', # ENDOTHELIAL (RED)
'BC8F8F', # EPITHELIAL (tan)
'00994C', # FIBROBLAST (dark green
'000080', # MACROPHAGE (blue)
'B0C4DE', # MAST (purple/gray)
'FF00FF', # MDSC (light blue/grey)
'0080FF', # MICROGLIA/MACROPHAGE
'D8BFD8', # MONOCYTE (magenta))
'983DFF', # MONOCYTE ACTIVATED(magenta))
'00CCCC', # PERICYTE (bright blue)
'00FF7F', # PROLIFERATING MESENCHYMAL PROGENITOR (bright green)
]
FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS = [
'0000FF', # 0
'FF0000', # 1
'FFD700', # 2
'808000', # 3
'006400', # 4
'20B2AA', # 5
'00BFFF', # 6
'1E90FF', # 7
'4B0082', # 8
'8B008B', # 9
'FF1493', # 10
'D2691E', # 11
'B0C4DE', # 12
'FAEBD7', # 14
'FFFACD', # 15
'D2B48C', # 16
'5F9EA0', # 18
'8FBC8F', # 19
'00FA9A', # 20
'66CDAA', # 21
'3CB371', # 22
]
FLATUI_CELL_TYPE_LYMPHOID_ALL = ['014E4E', # Breg
'000000', # IG
'FF0000', # NK
'800000', # NKT
'FFD700', # Th
'994C00', # Tm
'FF8000', # Treg
]
# UPDATE COLOR MAP TO ASSOCIATE LINEAGE COLOR WITH EACH CLASS
FLATUI_CLASS_NORMAL_LINEAGES = [
'FF66FF', # AEC1, CLASS 2
'6600CC', # AEC2, CLASS 0
'3399FF', # CILIATED, CLASS 1
'FF8800', # CLUB/CLARA, CLASS 3
]
FLATUI_PHENOGRAPH_CLASS_NORMAL = [
'6600CC', # AT2, CLASS 0
'3399FF', # CILIATED, CLASS 1
'FF66FF', # AT1, CLASS 2
'FF8800', # CLUB/CLARA, CLASS 3
]
# DEFINE LINEAGE COLORMAP
FLATUI_LINEAGE = [
'FF66FF', # AEC1 pink
'6600CC', # AEC2 purple
'9ACD32', # AEP/MUCINOUS
'3399FF', # CILIATED skye blue
'FF8800', # CLUB orange
'0000FF', # KRT5+BASAL (MESENCHYMAL/KRAS HIGH),#'0000FF' dark blue
'FFD700', # PNEC
'E6E6FA', # STEM/PROGENITOR
]
FLATUI_SOURCE_RX = [
'FF0000', # MET
'E6E6FA', # NORMAL
'000000', # PRIMARY
'FF8000', # PRIMARY_RX
]
FLATUI_CLASS_ALL_EP = [
'0000FF', # 0 BLUE
'FF0000', # 1 RED
'FF8C00', # 2 ROANGE
'FFD700', # 3 YELLOW
'808000', # 4 Dark yellow
'9ACD32', # 5 Lime Green
'006400', # 6 Dark green
'20B2AA', # 7 Teal
'2F4F4F', # 8 GREY DARK
'00BFFF', # 9 CYAN
'191970', # 11 DARK PURPLE
'4B0082', # 12 LIGHTER PURPLE
'8B008B', # 13 PINK-PURPLE
'FF00FF', # 14 MAGENTA
'FF1493', # 15 CHERRY
'8B4513', # 16 BROWN
'B0C4DE', # 19 LIGHT BLUEGREY
'696969', # 20
'FFB6C1', # 21 #SALMON
'FAEBD7', # 22 # TAN
'F5DEB3', # 23
'00CCCC', # 24 TEAL
'BC8F8F', # 26
]
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_CLASS_ALL_EP),3))
for ind,hexcolor in enumerate(FLATUI_CLASS_ALL_EP):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS_ALL_EP = LinearSegmentedColormap.from_list('FLATUI_CLASS_ALL_EP', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_SAMPLE),3))
for ind,hexcolor in enumerate(FLATUI_SAMPLE):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SAMPLES = LinearSegmentedColormap.from_list('FLATUI_SAMPLE', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_NORMAL)
colors = np.zeros((len(FLATUI_NORMAL),3))
for ind,hexcolor in enumerate(FLATUI_NORMAL):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_NORMAL = LinearSegmentedColormap.from_list('FLATUI_NORMAL', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_CANCER & NORMAL)
colors = np.zeros((len(FLATUI_CANCERandNORMAL),3))
for ind,hexcolor in enumerate(FLATUI_CANCERandNORMAL):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CANCERandNORMAL = LinearSegmentedColormap.from_list('FLATUI_CANCERandNORMAL', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_SOURCE)
colors = np.zeros((len(FLATUI_SOURCE),3))
for ind,hexcolor in enumerate(FLATUI_SOURCE):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SOURCE = LinearSegmentedColormap.from_list('FLATUI_SOURCE', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_PATIENT)
colors = np.zeros((len(FLATUI_PATIENT),3))
for ind,hexcolor in enumerate(FLATUI_PATIENT):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_PATIENT = LinearSegmentedColormap.from_list('FLATUI_PATIENT', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_CLASS),3))
for ind,hexcolor in enumerate(FLATUI_CLASS):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS = LinearSegmentedColormap.from_list('FLATUI_CLASS', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_METACELL)
colors = np.zeros((len(FLATUI_METACELL),3))
for ind,hexcolor in enumerate(FLATUI_METACELL):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_METACELL = LinearSegmentedColormap.from_list('FLATUI_METACELL', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_CELL_TYPE),3))
for ind,hexcolor in enumerate(FLATUI_CELL_TYPE):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CELL_TYPE = LinearSegmentedColormap.from_list('FLATUI_CELL_TYPE', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS),3))
for ind,hexcolor in enumerate(FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS = LinearSegmentedColormap.from_list('FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_CELL_TYPE_MYELOID_OTHER_ALL),3))
for ind,hexcolor in enumerate(FLATUI_CELL_TYPE_MYELOID_OTHER_ALL):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CELL_TYPE_MYELOID_OTHER_ALL = LinearSegmentedColormap.from_list('FLATUI_CELL_TYPE_MYELOID_OTHER_ALL', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS),3))
for ind,hexcolor in enumerate(FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_LYMPHOID_ALL_PHENOGRAPH_CLASS = LinearSegmentedColormap.from_list('FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_CELL_TYPE_LYMPHOID_ALL),3))
for ind,hexcolor in enumerate(FLATUI_CELL_TYPE_LYMPHOID_ALL):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CELL_TYPE_LYMPHOID_ALL = LinearSegmentedColormap.from_list('FLATUI_CELL_TYPE_LYMPHOID_ALL', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PHENOGRAPH_CLASS_NORMAL),3))
for ind,hexcolor in enumerate(FLATUI_PHENOGRAPH_CLASS_NORMAL):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_PHENOGRAPH_CLASS_NORMAL = LinearSegmentedColormap.from_list('FLATUI_PHENOGRAPH_CLASS_NORMAL', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_CLASS_NORMAL_LINEAGES),3))
for ind,hexcolor in enumerate(FLATUI_CLASS_NORMAL_LINEAGES):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS_NORMAL_LINEAGES = LinearSegmentedColormap.from_list('FLATUI_CLASS_NORMAL_LINEAGES', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_LINEAGE),3))
for ind,hexcolor in enumerate(FLATUI_LINEAGE):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_LINEAGE = LinearSegmentedColormap.from_list('FLATUI_LINEAGE', colors, N=len(colors))
# CONVERT HEX TO RGB (FLATUI_METACELL)
colors = np.zeros((len(FLATUI_SOURCE_RX),3))
for ind,hexcolor in enumerate(FLATUI_SOURCE_RX):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SOURCE_RX = LinearSegmentedColormap.from_list('FLATUI_SOURCE_RX', colors, N=len(colors))
# CONTINUOUS: DIVERGING
CM_DIVERGING = plt.cm.RdBu_r
# CONTINUOUS: SEQUENTIAL
CM_SEQUENTIAL = cmocean.cm.thermal
We profiled the transcriptomes of 40,505 single cells obtained from 17 freshly resected human tissue samples comprising adjacent non-tumour involved lung (n = 4, hereafter referred to as normal lung), primary lung adenocarcinoma (n = 8; 7 untreated and 1 post neo-adjuvant chemotherapy), as well as brain (n = 3), bone (n = 1), and adrenal (n = 1) lung adenocarcinoma metastases (Fig. 1a). These samples were derived from patients spanning various stages of tumour progression (Extended Data Table 1) without enrichment for a specific cell type, such that the viable tumour sample and its microenvironment were sampled in an unbiased manner. scRNA-seq data from all patients were merged to create a global cell atlas of the normal lung, primary tumours, and metastases. In this section, we walk through cell type annotation within this global atlas.
Processed data is stored as H5 with the following files, sharing common multi-index
# SPECIFY DATA SUBSET
subset_type = 'ALL'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
exec('QUERY = DF_{}'.format(subset_type))
plt.figure(figsize = (9,3))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.5, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG10 LIBRARY SIZE
ax = plt.subplot(gs1[0])
bins = np.linspace(np.log10(QUERY.values.sum(axis=1).min()), np.log10(QUERY.values.sum(axis=1).max())*0.95, 100)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
x = np.log10(QUERY.select(lambda x: x[1] in label).values.sum(axis=1)) # log cell size
plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255))
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog10 Cell Size')
plt.grid(True)
sns.despine()
# (2) PLOT LOG NUM UNIQUE GENES
ax = plt.subplot(gs1[1])
bins = np.linspace(np.log10((np.sum(QUERY > 0,axis=1)).min()),
np.log10((np.sum(QUERY > 0,axis=1)).max())*0.95, 100)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
x = np.log10(np.sum(QUERY.select(lambda x: x[1] in label) > 0,axis=1))
plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255))
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog10 Unique genes')
plt.grid(True)
sns.despine()
# (3) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
ax = plt.subplot(gs1[2])
data = np.log10(np.sum(QUERY.values > 0,axis=0))
bins = np.linspace(0, data.max()*0.95, 20)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
x = np.log10(np.sum(QUERY.select(lambda x: x[1] in label) > 0,axis=0))
x[(np.isinf(x)) | (np.isnan(x))] = 0
plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255))
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog. Cells/Gene')
plt.grid(True)
sns.despine()
# Add (abbreviated) legend bottom left
L = plt.legend(loc='upper right',prop={'size':6},bbox_to_anchor=(2, 0.95),fancybox=True)
# SAVE FIGURE
figure_label = '_library_distribution_subset_{}_log10'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label + '.png'
plt.savefig(fn, dpi=fig_dpi)
print(fn)
# LOAD CANONICAL CELL TYPE MARKERS FROM EXCEL FILE
# THIS CORRESPONDS TO SUPPLEMENTARY_TABLE2/TAB1 (Gene Signatures from ED Fig. 1i)
filename = 'CELL_TYPES_WANSLEEBEN_HOGAN_2013'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
print(shape(genesets)[1])
# ROW INDEX
meta = 'META_CELL_TYPE'
FLATUI_PLOT = FLATUI_METACELL
title = os.path.basename(path_to_genesets)[:-4]
# GENERATE MATRIX OF CUMULATIVE EXPRESION PER CELL TYPE
exec('QUERY = INDF_{}'.format(subset_type))
titles = list(genesets.columns)#['EPITHELIAL','MESENCHYAL','CD45']
tmp = pd.DataFrame(index = QUERY.index)
for title in titles:
genes = genesets.values[:,np.where(genesets.columns==title)[0][0]] # genes x pathways
genes = [x for x in genes if str(x) != 'nan']
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
exec('tmp[\'{}_SCORE\'] = (np.nansum(vals,axis=1))'.format(title))
genes = list(tmp.columns)
# ROW COLORS
ind = QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind).map(lut)
# CONSTRUCT HEATMAP DATA
heatmap_data = pd.DataFrame(data = zscore(tmp.values,axis=0),columns = genes)
yticks = heatmap_data.index
xticks = heatmap_data.columns
# LINKAGE
method = 'average' # average, single
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(18,10))
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.088,0.1,0.01,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
pos = (x, y / len(r1))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.01,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().set_ticks([])
# SAVE FIGURE
figure_label = 'heatmaps/user_defined/_{}_{}_HEATMAP_{}_{}_{}'.format(meta,subset_type,title,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
d = os.path.dirname(fn)
if not os.path.exists(d):
os.makedirs(d)
fig.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SPECIFY SUBSET
subset_type = 'ALL'
dimtype = 'IPC_TSNE' # TSNE COMPUTED ON PRINCIPAL COMPONENTS ON THE INDF
dot_size = 1
exec('DIM = DIMENSIONS_{}'.format(subset_type))
# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.
dot_size = 1
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)
# SAMPLE ID: LEGEND
ax = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('Legend'),
cmap=CM_SAMPLES,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
# METACELL TYPE ASSIGNMENT: LYMPHOID/MYELOID/OTHER(STROMA/EPITHELIUM)
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('META_CELL_TYPE'),
cmap=CM_METACELL,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('META_CELL_TYPE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
# SOURCE: NORMAL/PRIMARY TUMOR/ETASTASIS
ax = plt.subplot(gs1[2])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('Meta-Source'),
cmap= CM_SOURCE,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax,randomize=True);
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('SOURCE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
# SAVE FIGURE
figure_label = '_{}_META_CELL_TYPE_AND_SOURCE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
h5_data.ls()
#DF_EPITHELIAL_NOR_TUMOR_MET.shape
# SPECIFY DATA SUBSET
subset_type = 'MYELOID_OTHER_ALL'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
dimtype = 'IPC_TSNE' # TSNE COMPUTED ON PRINCIPAL COMPONENTS ON THE INDF
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.
dot_size = 2
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)
# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'),
cmap=CM_SAMPLES,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()
# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
numeric_labels = [int(label[:-14]) for label in QUERY.index.get_level_values('PHENOGRAPH_CLASS')]
g = seqc.plot.scatter.categorical(x, y, c=numeric_labels,
cmap= CM_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS,
legend_kwargs={'ncol': 3, 'borderaxespad':0.,
'bbox_to_anchor':(0.75, 0)},
s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')
# CELL TYPE ASSIGNMENTS
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'),
cmap= CM_CELL_TYPE_MYELOID_OTHER_ALL,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')
# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL
# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST']
marker_genes_dict = {
'EPITHELIAL': ['CDH1','SOX9','SOX2'],
'STROMA': ['VIM', 'ACTA2','DES','PDGFRA','PDGFRB','CDH5','PECAM1','CD34'],
'MONOCYTE': ['FCGR3A','ITGAX','CD14'],
'DENDRITIC':['CD40','CD80','HLA-DOA', 'HLA-DOB','HLA-DQA1','HLA-DQA2','HLA-DQB1'],
'MHC2': ['HLA-DPA1','HLA-DPB1', 'HLA-DQA1','HLA-DQA2','HLA-DQB1', 'HLA-DRA','HLA-DRB1'],
'MACROPHAGE': ['LAMP2','LGALS3','MRC1','TFRC','FCGR2A','ITGAM','CD163'],
'NEUTROPHIL': ['LY6G6D','S100A9','S100A8','S100A12','PILRA','LILRB2','ISG15'],
'MDSC':['CYBB','CSF1R','STAT1','CD14','FCGR1A','FCGR1B','CD33','CCR1'],
'MAST':['KIT'] #,'ENPP3'
}
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['EPITHELIAL','STROMA','MONOCYTE','NEUTROPHIL','MDSC','MACROPHAGE','DENDRITIC','MAST']
# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
genes = marker_genes_dict[gene_group]
genes = [gene for gene in genes if gene in QUERY.columns]
labels = [gene_group] * len(genes)
current = pd.DataFrame({'labels':labels,'genes':genes})
tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values
var_group_positions = []
for gene_group in var_group_labels:
cpositions = list(tmp[tmp['labels'] ==gene_group].index)
var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))
# DOT PLOT OF GENES LABELED BY CATEGORY
ax = dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
save=None)
# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL
# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))
# CONVERT HEX TO RGB
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)
# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
yticks = heatmap_data.index
xticks = heatmap_data.columns
# COLOR ROWS BY <meta>
ind = list(heatmap_data.index)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# LINKAGE
method = 'average'
metric = 'correlation'
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(12,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6])
x = 0
y = 0
for c in row_colors[list(mat.index)]:
pos = (x, y / len(mat.index))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6])
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)
# SAVE FIGURE
figure_label = '_{}_matrixPlotMedian_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL
exec('QUERY = INDF_{}'.format(subset_type))
plot_order = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST']
genes = marker_genes
# GENERATE COLORMAP FOR ROWS (COLORED BY <meta>)
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0),
'gene':violin_data.index.get_level_values(1),
'Expression':violin_data.values})
# (0,0) Plot violin plot
N = len(plot_order)
# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (8,N*1))
sns.set_style("white")
for iii,cell_type in enumerate(plot_order):
ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type],
inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1,
scale = 'count',color = palette[cell_type],ax = ax1)
g.set_ylabel(" ")
g.set_xlabel(" ")
g.tick_params(axis='y',labelsize=8, rotation = 0)
plt.ylim(0,10)
plt.title("{}".format(cell_type.title()),fontsize=8,fontname = 'Arial',loc = 'left',
rotation = 'horizontal', ha = 'left', pad = 1.5)
ax1.spines['top'].set_linewidth(0.5)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_linewidth(0.5)
ax1.spines['bottom'].set_linewidth(0.5)
ax1.spines['left'].set_linewidth(0.5)
ax1.spines['right'].set_visible(False)
if iii != (N-1):
ax1.tick_params(axis='x', bottom=False, labelbottom=False)
else:
ax1.tick_params(axis='x', labelsize=8, rotation = 90)
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
gene_name = 'CDH1'
exec('QUERY = INDF_{}'.format(subset_type))
plot_order = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST']
# GENERATE COLORMAP
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0),
'gene':violin_data.index.get_level_values(1),
'Expression':violin_data.values})
gene_data = violin_data[violin_data['gene']==gene_name]
row_order = list(gene_data.groupby(meta).median().sort_values(by=['Expression']).index)
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(gene_data, row=meta,
hue = meta,aspect=10, size = 0.30,row_order = row_order,
palette=palette,col_wrap=None, sharex=True, sharey=True)
# Draw the densities in a few steps
g.map(sns.kdeplot, "Expression", clip_on=False, shade=True, alpha=1, lw=1, bw=.2, kernel = 'gau')
g.map(sns.kdeplot, "Expression", clip_on=False, color="w", lw=1, bw=.2, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
ax = plt.gca()
ax.text(0, 0.1, label,color=color,fontsize=8,
ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Expression")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.75)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)
plt.xlabel('KDE({} expression)'.format(gene_name.title()),fontsize=10)
# SAVE FIGURE
figure_label = '_{}_stackedKDE_{}_by{}'.format(subset_type,gene_name,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# DMAP (ALREADY LOG TRANSFORMED)
dmap = pd.read_table(os.path.expanduser(DATA_PATH+'DMap_data.gct'),
skiprows=2, parse_dates=False, index_col=[0, 1]).T # cells x genes
dmap.columns = dmap.columns.droplevel(0)
dmap = dmap.groupby(level=0, axis=1).mean() # average over duplicate genes
# average over dmap replicates
dmap.index = [re.sub('_\d{1,2}', '', i) for i in dmap.index]
dmap = dmap.groupby(level=0, axis=0).mean()
# z-normalize
dmap = pd.DataFrame(data = zscore(dmap,axis=0), columns = dmap.columns, index = dmap.index)
# GARVAN (NOT YET LOG TRANSFORMED)
garvan = pd.read_csv(os.path.expanduser(DATA_PATH+'garvan_immune_nonorm.csv'),
index_col=0)
# average over garvan replicates
garvan.columns = [re.sub('_\d', '', i) for i in garvan.columns]
garvan = garvan.groupby(level=0, axis=1).mean()# genes x cell types
# transpose and log transform data
garvan = np.log2(garvan + 1).T # cell types x genes
# z-normalize
garvan = pd.DataFrame(data = zscore(garvan,axis=0), columns = garvan.columns, index = garvan.index)
# Drop irrelevant cell types for myeloid/other compartment
dmap = dmap.drop(['BASO1', 'EOS2', 'ERY1', 'ERY2', 'ERY3', 'ERY4', 'ERY5', 'GMP', 'CMP',
'GRAN1', 'GRAN2', 'GRAN3', 'HSC1', 'HSC3', 'MEGA1', 'MEGA2', 'MEP',
'NKA1','NKA2','NKA3','NKA4',
'TCELLA2','TCELLA3','TCELLA4','TCELLA6','TCELLA7','TCELLA8','TCELLA1',
'PRE_BCELL2','PRE_BCELL3','BCELLA1','BCELLA2','BCELLA3','BCELLA4'])#
garvan = garvan.drop(['NK','bcells','central_memory_T_cells','effector_memory_T_cells',
'eosinophils_PMAh','eosinophils_control_rep2','th1_cells','basophils','eosinophils',
'th2_cells'])
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL WITH HIGH DISPERSION
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL
exec('QUERY = NDF_{}'.format(subset_type))
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 1,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 5)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>biological_variance.mean()
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
retain_goi = list(retain_bool[retain_bool].index)
print(len(retain_goi))
# IDENTIFY VARIABLE GENES ALSO DETECTED IN BULK MRNA DATA
expressed = list(set(retain_goi).intersection(dmap.columns).intersection(garvan.columns))
print(len(expressed))
# SELECT INTERSECTING SINGLE CELL VARIABLE/EXPRESSED GENES IN BULK DATASETS
exec('QUERY = np.log2(INDF_{}+1)'.format(subset_type))# LOG TRANSFORM FOR COMPARISON TO LOG TRANSFORMED BULK DATA
centroid_data = pd.DataFrame(data = zscore(QUERY[expressed].values,axis=0),columns = expressed, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
bulk_sigs = dmap[expressed].append(garvan[expressed])
# COLORMAP LUT BASED ON QUERY & MET
ind = QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
### CORRELATION BETWEEN CENTROIDS ####
rvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
pvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
for centroid_ind, centroid_ind_name in enumerate(centroid_data.index):
for bulk_ind,val in enumerate(bulk_sigs.index):
m = centroid_data.ix[centroid_ind]
h = bulk_sigs.ix[val]
# Linear Regression
lr = linregress(m, h)
slope = lr[0]
intercept = lr[1]
rvalue = lr[2]
pvalue = lr[3]
stderr = lr[4]
rvalues[centroid_ind,bulk_ind] = rvalue
pvalues[centroid_ind,bulk_ind] = pvalue
# Generate Dataframe
rvalues = pd.DataFrame(
data=rvalues,
columns=list(bulk_sigs.index),
index=centroid_data.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'
pvalues = pd.DataFrame(
data=pvalues,
columns=list(bulk_sigs.index),
index=centroid_data.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'
# ROW COLORS
ind = list(rvalues.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)
# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average'
metric = 'euclidean'
linkage = hc.linkage(rvalues, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(rvalues.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = rvalues.ix[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(3,round(rvalues.shape[0]/3)))
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.125,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
pos = (x, y / len(r1))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
minplot = 0.10
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.16,0.1,0.7,0.6])
im = axmatrix.matshow(mat[(pvalues < 0.01)*(np.abs(mat)>minplot)], aspect='auto',
origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
plt.setp(axmatrix.spines.values(), linewidth=0.5)
# ADD COLORBAR
axcolor = fig.add_axes([0.16,0.71,0.7,0.03])
cbar = plt.colorbar(im, cax=axcolor, orientation = 'horizontal')
cbar.ax.xaxis.set_ticks_position('top')
cbar.outline.set_visible(False)
# ADD DENDROGRAM
matplotlib.rcParams['lines.linewidth'] = 0.5
sch.set_link_color_palette(['#000000', '#000000', '#000000', '#000000','#000000','#000000','#000000'])
ax2 = fig.add_axes([0.045,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#000000')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# SAVE FIGURE
figure_label = '_{}_CORRELATION_{}_DBGAPandGARVAN_{}_{}'.format(subset_type,meta,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SPECIFY DATA SUBSET
subset_type = 'LYMPHOID_ALL'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
subset_type = 'LYMPHOID_ALL'
dimtype = 'RLS_IPC_TSNE' # TSNE COMPUTED ON PRINCIPAL COMPONENTS(PCS) ON THE INDF AFTER REGRESSION OF LIBRARY SIZE FROM PCS
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.
dot_size = 2
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)
# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'),
cmap=CM_SAMPLES,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()
# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
numeric_labels = [int(label[:-9]) for label in QUERY.index.get_level_values('PHENOGRAPH_CLASS')]
g = seqc.plot.scatter.categorical(x, y, c=numeric_labels,
cmap= CM_LYMPHOID_ALL_PHENOGRAPH_CLASS,
legend_kwargs={'ncol': 3, 'borderaxespad':0.,
'bbox_to_anchor':(0.75, 0)},
s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')
# CELL TYPE ASSIGNMENTS
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'),
cmap= CM_CELL_TYPE_LYMPHOID_ALL,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')
# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL
# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['Breg', 'IG', 'NK', 'NKT', 'Treg','Th', 'Tm']
marker_genes_dict = {
'B_CELL': ['CD79A','MS4A1'],
'PLASMA_CELL': ['IGHM','IGHA1','IGHG1','IGHG2','IGHG3','IGHG4'],
'T_CELL': ['CD2','CD3D','CD3E','CD3G','CD8A','CD8B','STAT4',
'CTLA4','IFNG','IL7R','CD40LG'],
'NK_CELL':['NKG7','GNLY','NCR3','CD16','KIR33DL1','KLRD1']
}
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['B_CELL','PLASMA_CELL','NK_CELL','T_CELL']
# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
genes = marker_genes_dict[gene_group]
genes = [gene for gene in genes if gene in QUERY.columns]
labels = [gene_group] * len(genes)
current = pd.DataFrame({'labels':labels,'genes':genes})
tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values
var_group_positions = []
for gene_group in var_group_labels:
cpositions = list(tmp[tmp['labels'] ==gene_group].index)
var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))
# DOT PLOT OF GENES LABELED BY CATEGORY
ax = dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
save=None)
# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta= 'CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL
# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))
# CONVERT HEX TO RGB
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)
# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
if meta=='PHENOGRAPH_CLASS':
new_ind = [int(name[:-9]) for name in heatmap_data.index]
heatmap_data = pd.DataFrame(data = heatmap_data.values, index = new_ind, columns = heatmap_data.columns)
yticks = heatmap_data.index
xticks = heatmap_data.columns
# Palatte for Class METADATA
ind = list(heatmap_data.index) # CLUSTERS
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# LINKAGE
method = 'average'
metric = 'correlation'
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(10,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[list(mat.index)]:
pos = (x, y / len(mat.index))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
plt.axis('off')
# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)
# SAVE FIGURE
figure_label = '_{}_matrixPlotMedian_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL
exec('QUERY = INDF_{}'.format(subset_type))
plot_order = ['Breg', 'IG', 'NK', 'NKT', 'Treg','Th', 'Tm']
genes = marker_genes
# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0),
'gene':violin_data.index.get_level_values(1),
'Expression':violin_data.values})
# (0,0) Plot violin plot
N = len(plot_order)
# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (3,N*1))
sns.set_style("white")
for iii,cell_type in enumerate(plot_order):
ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type],
inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1,
scale = 'count',color = palette[cell_type],ax = ax1)
g.set_ylabel(" ")
g.set_xlabel(" ")
g.tick_params(axis='y',labelsize=8, rotation = 0)
plt.ylim(0,10)
plt.title("{}".format(cell_type.title()),fontsize=8,fontname = 'Arial',loc = 'left',
rotation = 'horizontal', ha = 'left', pad = 1.5)
ax1.spines['top'].set_linewidth(0.5)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_linewidth(0.5)
ax1.spines['bottom'].set_linewidth(0.5)
ax1.spines['left'].set_linewidth(0.5)
ax1.spines['right'].set_visible(False)
if iii != (N-1):
ax1.tick_params(axis='x', bottom=False, labelbottom=False)
else:
ax1.tick_params(axis='x', labelsize=8, rotation = 90)
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
gene_name = 'NKG7'
exec('QUERY = INDF_{}'.format(subset_type))
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL
# COLOR ROWS BY <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0),
'gene':violin_data.index.get_level_values(1),
'Expression':violin_data.values})
gene_data = violin_data[violin_data['gene']==gene_name]
row_order = list(gene_data.groupby(meta).median().sort_values(by=['Expression']).index)
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(gene_data, row=meta,
hue = meta,aspect=10, size = 0.30,row_order = row_order,
palette=palette,col_wrap=None, sharex=True, sharey=True)
# Draw the densities in a few steps
g.map(sns.kdeplot, "Expression", clip_on=False, shade=True, alpha=1, lw=1, bw=.2, kernel = 'gau')
g.map(sns.kdeplot, "Expression", clip_on=False, color="w", lw=1, bw=.2, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
ax = plt.gca()
ax.text(0, 0.1, label,color=color,fontsize=8,
ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Expression")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.75)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)
plt.xlabel('KDE({} expression)'.format(gene_name.title()),fontsize=10)
# SAVE FIGURE
figure_label = '_{}_stackedKDE_{}_by{}'.format(subset_type,gene_name,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
#### LOAD BULK MRNA DATA ####
# DMAP (ALREADY LOG TRANSFORMED)
dmap = pd.read_table(os.path.expanduser(DATA_PATH+'DMap_data.gct'),
skiprows=2, parse_dates=False, index_col=[0, 1]).T # cells x genes
dmap.columns = dmap.columns.droplevel(0)
dmap = dmap.groupby(level=0, axis=1).mean() # average over duplicate genes
# average over dmap replicates
dmap.index = [re.sub('_\d{1,2}', '', i) for i in dmap.index]
dmap = dmap.groupby(level=0, axis=0).mean()
# z-normalize
dmap = pd.DataFrame(data = zscore(dmap,axis=0), columns = dmap.columns, index = dmap.index)
# GARVAN (NOT YET LOG TRANSFORMED)
garvan = pd.read_csv(os.path.expanduser(DATA_PATH+'garvan_immune_nonorm.csv'),
index_col=0)
# average over garvan replicates
garvan.columns = [re.sub('_\d', '', i) for i in garvan.columns]
garvan = garvan.groupby(level=0, axis=1).mean()# genes x cell types
# transpose and log transform data
garvan = np.log2(garvan + 1).T # cell types x genes
# z-normalize
garvan = pd.DataFrame(data = zscore(garvan,axis=0), columns = garvan.columns, index = garvan.index)
# Drop irrelevant cell types for myeloid/other compartment
dmap = dmap.drop(['BASO1', 'EOS2', 'ERY1', 'ERY2', 'ERY3',
'ERY4', 'ERY5', 'GMP', 'GRAN1', 'GRAN2',
'GRAN3', 'HSC1', 'HSC3', 'MEGA1', 'MEGA2',
'MEP', 'PRE_BCELL2', 'PRE_BCELL2',
'PRE_BCELL3','CMP','DENDA1','DENDA2',
'GMP','HSC1','HSC3','MONO1','MONO2'], axis=0)
garvan = garvan.drop(['basophils','dc', 'dc_LPS8h','eosinophils', 'eosinophils_PMAh',
'eosinophils_control_rep2', 'macrophages', 'macrophages_LPSh',
'mast_cells', 'mast_cells_IgEh', 'neutrophils', 'neutrophils_LPSh'], axis=0)
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
meta= 'CELL_TYPE'
FLATUI_PLOT =FLATUI_CELL_TYPE_LYMPHOID_ALL
exec('QUERY = NDF_{}'.format(subset_type))
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 1,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 5)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
retain_goi = list(retain_bool[retain_bool].index)
print(len(retain_goi))
# IDENTIFY VARIABLE GENES ALSO DETECTED IN BULK MRNA DATA
expressed = list(set(retain_goi).intersection(dmap.columns).intersection(garvan.columns))
print(len(expressed))
# SELECT INTERSECTING SINGLE CELL VARIABLE/EXPRESSED GENES IN BULK DATASETS
exec('QUERY = np.log2(INDF_{}+1)'.format(subset_type))# LOG TRANSFORM FOR COMPARISON TO LOG TRANSFORMED BULK DATA
centroid_data = pd.DataFrame(data = zscore(QUERY[expressed].values,axis=0),columns = expressed, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
bulk_sigs = dmap[expressed].append(garvan[expressed])
# COLORMAP LUT BASED ON QUERY & MET
ind = centroid_data.index
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
### CORRELATION BETWEEN CENTROIDS ####
rvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
pvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
for centroid_ind, centroid_ind_name in enumerate(centroid_data.index):
for bulk_ind,val in enumerate(bulk_sigs.index):
m = centroid_data.ix[centroid_ind]
h = bulk_sigs.ix[val]
# Linear Regression
lr = linregress(m, h)
slope = lr[0]
intercept = lr[1]
rvalue = lr[2]
pvalue = lr[3]
stderr = lr[4]
rvalues[centroid_ind,bulk_ind] = rvalue
pvalues[centroid_ind,bulk_ind] = pvalue
# Generate Dataframe
rvalues = pd.DataFrame(
data=rvalues,
columns=list(bulk_sigs.index),
index=centroid_data.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'
pvalues = pd.DataFrame(
data=pvalues,
columns=list(bulk_sigs.index),
index=centroid_data.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'
# ROW COLORS
ind = list(rvalues.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)
# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average'
metric = 'euclidean'
linkage = hc.linkage(rvalues, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(rvalues.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = rvalues.ix[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(5,round(rvalues.shape[0]/3)))
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.125,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
pos = (x, y / len(r1))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.16,0.1,0.7,0.6])
im = axmatrix.matshow(mat[pvalues < 0.01], aspect='auto',
origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
plt.setp(axmatrix.spines.values(), linewidth=0.5)
# ADD COLORBAR
axcolor = fig.add_axes([0.16,0.71,0.7,0.03])
cbar = plt.colorbar(im, cax=axcolor, orientation = 'horizontal')
cbar.ax.xaxis.set_ticks_position('top')
cbar.outline.set_visible(False)
# ADD DENDROGRAM
matplotlib.rcParams['lines.linewidth'] = 0.5
sch.set_link_color_palette(['#000000', '#000000', '#000000', '#000000','#000000','#000000','#000000'])
ax2 = fig.add_axes([0.045,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#000000')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# SAVE FIGURE
figure_label = '_{}_CORRELATION_{}_DBGAPandGARVAN_{}_{}'.format(subset_type,meta,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
Cell type assignments were mapped directly back to the global dataset, using clusters annotated in MYELOID/OTHER and LYMPHOID analysis above. The index, 'PHENOGRAPH_CLASS' now also includes the phenograph class ID generated within each of the respective data subsets.
GENE_RANK_MAST for the 'ALL' subset was run for each cell type vs. all other cells in this complete merged atlas.
# SPECIFY SUBSET
subset_type = 'ALL'
dimtype = 'IPC_TSNE'
dot_size = 1
exec('DIM = DIMENSIONS_{}'.format(subset_type))
# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.
dot_size = 1
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)
# SAMPLE ID
ax = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('Legend'),
cmap=CM_SAMPLES,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
# METACELL TYPE ASSIGNMENT
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('META_CELL_TYPE'),
cmap=CM_METACELL,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('META_CELL_TYPE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
# SOURCE: NORMAL VS. PRIMARY TUMOR VS. METASTASIS
ax = plt.subplot(gs1[2])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('CELL_TYPE'),
cmap= CM_CELL_TYPE,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax,randomize=True);
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('SOURCE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
figure_label = '_{}_LEGEND_METACELLTYPE_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE
# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST',
'Breg', 'IG', 'NK', 'NKT', 'Treg','Th', 'Tm']
marker_genes_dict = {
'EPITHELIAL': ['CDH1','SOX9','SOX2'],
'STROMA': ['VIM', 'ACTA2','DES','PDGFRA','PDGFRB','CDH5','PECAM1','CD34'],
'MONOCYTE': ['FCGR3A','ITGAX','CD14'],
'DENDRITIC':['CD40','CD80','HLA-DOA', 'HLA-DOB','HLA-DQA1','HLA-DQA2','HLA-DQB1'],
'MHC2': ['HLA-DPA1','HLA-DPB1', 'HLA-DQA1','HLA-DQA2','HLA-DQB1', 'HLA-DRA','HLA-DRB1'],
'MACROPHAGE': ['LAMP2','LGALS3','MRC1','TFRC','FCGR2A','ITGAM','CD163'],
'NEUTROPHIL': ['LY6G6D','S100A9','S100A8','S100A12','PILRA','LILRB2','ISG15'],
'MDSC':['CYBB','CSF1R','STAT1','CD14','FCGR1A','FCGR1B','CD33','CCR1'],
'MAST':['KIT'],
'B_CELL': ['CD79A','MS4A1'],
'PLASMA_CELL': ['IGHM','IGHA1','IGHG1','IGHG2','IGHG3','IGHG4'],
'T_CELL': ['CD2','CD3D','CD3E','CD3G','CD8A','CD8B','STAT4',
'CTLA4','IFNG','IL7R','CD40LG'],
'NK_CELL':['NKG7','GNLY','NCR3','CD16','KIR33DL1','KLRD1'],
'IMMUNE':['PTPRC']
}
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['EPITHELIAL','STROMA','MONOCYTE','NEUTROPHIL','MDSC','MACROPHAGE','DENDRITIC','MAST','B_CELL','PLASMA_CELL','NK_CELL','T_CELL','IMMUNE']
# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
genes = marker_genes_dict[gene_group]
genes = [gene for gene in genes if gene in QUERY.columns]
labels = [gene_group] * len(genes)
current = pd.DataFrame({'labels':labels,'genes':genes})
tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values
var_group_positions = []
for gene_group in var_group_labels:
cpositions = list(tmp[tmp['labels'] ==gene_group].index)
var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))
# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
save=None)
# COLORMAP LUT BASED ON QUERY & MET
ind = categories #QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE
# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))
# CONVERT HEX TO RGB
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)
# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
yticks = heatmap_data.index
xticks = heatmap_data.columns
# Palatte for Class METADATA
ind = list(heatmap_data.index) # CLUSTERS
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# LINKAGE
method = 'average' # average, single
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(15,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[list(mat.index)]:
pos = (x, y / len(mat.index))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)
# SAVE FIGURE
figure_label = '_{}_matrixPlo`btMedian_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SAVE GENE ORDER FOR STACKED VIOLIN PLOT
global_marker_gene_order = mat.columns
#### LOAD BULK MRNA DATA ####
# DMAP (ALREADY LOG TRANSFORMED)
dmap = pd.read_table(os.path.expanduser(DATA_PATH+'DMap_data.gct'),
skiprows=2, parse_dates=False, index_col=[0, 1]).T # cells x genes
dmap.columns = dmap.columns.droplevel(0)
dmap = dmap.groupby(level=0, axis=1).mean() # average over duplicate genes
# average over dmap replicates
dmap.index = [re.sub('_\d{1,2}', '', i) for i in dmap.index]
dmap = dmap.groupby(level=0, axis=0).mean()
# z-normalize
dmap = pd.DataFrame(data = zscore(dmap,axis=0), columns = dmap.columns, index = dmap.index)
# GARVAN (NOT YET LOG TRANSFORMED)
garvan = pd.read_csv(os.path.expanduser(DATA_PATH+'garvan_immune_nonorm.csv'),
index_col=0)
# average over garvan replicates
garvan.columns = [re.sub('_\d', '', i) for i in garvan.columns]
garvan = garvan.groupby(level=0, axis=1).mean()# genes x cell types
# transpose and log transform data
garvan = np.log2(garvan + 1).T # cell types x genes
# z-normalize
garvan = pd.DataFrame(data = zscore(garvan,axis=0), columns = garvan.columns, index = garvan.index)
# Drop irrelevant cell types for myeloid/other compartment
dmap = dmap.drop(['BASO1', 'EOS2', 'ERY1', 'ERY2', 'ERY3',
'ERY4', 'ERY5', 'GMP', 'GRAN1', 'GRAN2',
'GRAN3', 'HSC1', 'HSC3', 'MEGA1', 'MEGA2',
'MEP', 'PRE_BCELL2', 'PRE_BCELL2',
'PRE_BCELL3','CMP','GMP','HSC1','HSC3'], axis=0)
garvan = garvan.drop(['basophils','eosinophils', 'eosinophils_PMAh',
'eosinophils_control_rep2'], axis=0)
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
meta= 'CELL_TYPE'
FLATUI_PLOT =FLATUI_CELL_TYPE
exec('QUERY = NDF_{}'.format(subset_type))
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 1,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 5)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
retain_goi = list(retain_bool[retain_bool].index)
print(len(retain_goi))
# IDENTIFY VARIABLE GENES ALSO DETECTED IN BULK MRNA DATA
expressed = list(set(retain_goi).intersection(dmap.columns).intersection(garvan.columns))
print(len(expressed))
# SELECT INTERSECTING SINGLE CELL VARIABLE/EXPRESSED GENES IN BULK DATASETS
exec('QUERY = np.log2(INDF_{}+1)'.format(subset_type))# LOG TRANSFORM FOR COMPARISON TO LOG TRANSFORMED BULK DATA
centroid_data = pd.DataFrame(data = zscore(QUERY[expressed].values,axis=0),columns = expressed, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
bulk_sigs = dmap[expressed].append(garvan[expressed])
# COLORMAP LUT BASED ON QUERY & MET
ind = QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
### CORRELATION BETWEEN CENTROIDS ####
rvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
pvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
for centroid_ind, centroid_ind_name in enumerate(centroid_data.index):
for bulk_ind,val in enumerate(bulk_sigs.index):
m = centroid_data.ix[centroid_ind]
h = bulk_sigs.ix[val]
# Linear Regression
lr = linregress(m, h)
slope = lr[0]
intercept = lr[1]
rvalue = lr[2]
pvalue = lr[3]
stderr = lr[4]
rvalues[centroid_ind,bulk_ind] = rvalue
pvalues[centroid_ind,bulk_ind] = pvalue
# Generate Dataframe
rvalues = pd.DataFrame(
data=rvalues,
columns=list(bulk_sigs.index),
index=centroid_data.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'
pvalues = pd.DataFrame(
data=pvalues,
columns=list(bulk_sigs.index),
index=centroid_data.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'
# ROW COLORS
ind = list(rvalues.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)
# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average'
metric = 'euclidean'
linkage = hc.linkage(rvalues, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(rvalues.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = rvalues.ix[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(15,round(rvalues.shape[0]/3)))
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.125,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
pos = (x, y / len(r1))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
minplot = 0.20
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.16,0.1,0.7,0.6])
im = axmatrix.matshow(mat[(pvalues < 0.05)*(np.abs(mat)>minplot)], aspect='auto',
origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
plt.setp(axmatrix.spines.values(), linewidth=0.5)
# ADD COLORBAR
axcolor = fig.add_axes([0.16,0.71,0.7,0.03])
cbar = plt.colorbar(im, cax=axcolor, orientation = 'horizontal')
cbar.ax.xaxis.set_ticks_position('top')
cbar.outline.set_visible(False)
# ADD DENDROGRAM
matplotlib.rcParams['lines.linewidth'] = 0.5
sch.set_link_color_palette(['#000000', '#000000', '#000000', '#000000','#000000','#000000','#000000'])
ax2 = fig.add_axes([0.045,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#000000')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# SAVE FIGURE
figure_label = '_{}_CORRELATION_{}_DBGAPandGARVAN_{}_{}'.format(subset_type,meta,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
subset_type = 'ALL'
meta = 'CELL_TYPE'
cm_flatui = FLATUI_CELL_TYPE
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
exec('tissue_cluster_sizes = QUERY.groupby(level=[\'Legend\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(meta))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]
# BARPLOT SHOWING FRACTION CELL TYPE DETECTED PER PATIENT SAMPLE
plt.figure(figsize = (5,2))
fig.canvas.draw()
sns.set_style("white")
ax = plt.gca()
legend_kwargs={'ncol': 2, 'borderaxespad':0.,'bbox_to_anchor':(1, 0)},
r1 = ['RU675_NORMAL_1ANS_UTR','RU682_NORMAL_2AS_UTR','RU684_NORMAL_1AS_UTR','RU685_NORMAL_1ANS_UTR',
'RU653_TUMOR_1AS_UTR','RU661_TUMOR_1AS_UTR','RU675_TUMOR_1ANS_UTR','RU676_TUMOR_1AS_UTR',
'RU679_TUMOR_2AS_TR','RU680_TUMOR_1AS_UTR','RU682_TUMOR_2AS_UTR','RU684_TUMOR_1AS_UTR',
'RU255B_BRAINMET_4S_TR','RU666_SPINEMET_4NS_TR','RU681_BRAINMET_4S_TR','RU699_ADRENALMET_4','RU701_BRAINMET_4'][::-1]
tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0).ix[r1].plot.barh(stacked=True,
color=colors,
ax = ax,
width = 0.95)
plt.xticks(fontsize = 8)
plt.yticks(fontsize = 8)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.legend_.remove()
plt.tight_layout()
# SAVE FIGURE
figure_label = '_{}_{}perSample_barplot'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=fig_dpi, transparent=True)
print(fn)
# SOURCE, CELL TYPE, FRACTION
vals = list(tissue_cluster_sizes.index)
classes = []
for name in vals:
if 'MET' in name:
classes.append('METASTASIS')
elif 'NORMAL' in name:
classes.append('NORMAL')
else:
classes.append('PRIMARY')
tissue_cluster_sizes['META SOURCE'] = classes
tissue_cluster_sizes = tissue_cluster_sizes.set_index('META SOURCE')
tissue_cluster_fractions = tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1), axis=0)
# FORMAT BOXPLOT DATA
boxplot_data = []
pseudocount = 0.0001
for ii in np.arange(tissue_cluster_fractions.shape[0]):
for jj in np.arange(tissue_cluster_fractions.shape[1]):
boxplot_data.append({'META SOURCE':tissue_cluster_fractions.index[ii],
'CELL TYPE':tissue_cluster_fractions.columns[jj],
'FRACTION':np.log10(tissue_cluster_fractions.iloc[ii,jj]+pseudocount)})
boxplot_data = pd.DataFrame(boxplot_data)
palette = {
'NORMAL': '#E6E6FA',
'PRIMARY': '#000000',
'METASTASIS': '#FF0000'
}
# GROUPED BOXPLOT WITH DATA POINTS OVERLAYED
fig = plt.figure(figsize = (20,6))
ax1 = plt.gca()
sns.set_style("white")
labels = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)',
'MICROGLIA/MACROPHAGE','NK', 'NKT','Th', 'Tm', 'Treg','Breg','IG','MAST']
g = sns.boxplot(x="CELL TYPE", y="FRACTION", hue="META SOURCE",data=boxplot_data, palette=palette,
hue_order = ['NORMAL','PRIMARY','METASTASIS'],
order = labels, fliersize = 4, showmeans=False,linewidth = 1)
g = sns.swarmplot(x="CELL TYPE", y="FRACTION", hue="META SOURCE",data=boxplot_data,
palette={'NORMAL': '#E6E6FA','PRIMARY': '#606060','METASTASIS': '#FF0000'},
size = 5,edgecolor='#000000', linewidth=1,
hue_order = ['NORMAL','PRIMARY','METASTASIS'],order = labels, split=True)
g.set_ylabel("\n\n\nLog10Fraction",fontsize=14,fontname = 'Arial')
g.set_xticklabels(labels,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=14)
sns.despine()
ax1.legend_.remove()
# SAVE FIGURE
figure_label = '{}_CELLTYPE_FRACTION_BY_SOURCE_BOXPLOT_LOG'.format(subset_type).replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
#The Kruskal-Wallis H-test tests the null hypothesis that the population median of all of the groups are equal.
#It is a non-parametric version of ANOVA. The test works on 2 or more independent samples, which may have different sizes.
#Note that rejecting the null hypothesis does not indicate which of the groups differs.
#Post-hoc comparisons between groups are required to determine which groups are different.
# ANOVA BY CELL TYPE (assumes normal distribution)
query_types= []
for cell_query in list(tissue_cluster_fractions.columns):
# SUBSET DATA FROM CELL TYPE
tmp = tissue_cluster_fractions[cell_query]
# Perform the ANOVA
if (stats.kruskal(tmp['NORMAL'], tmp['PRIMARY'], tmp['METASTASIS'])[1] < 0.05):
print(cell_query)
print(stats.kruskal(tmp['NORMAL'], tmp['PRIMARY'], tmp['METASTASIS']))
query_types.append(cell_query)
print(query_types)
# RANGE OF CLUSTER SIZES
meta = 'CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE
exec('QUERY = INDF_{}'.format(subset_type))
cell_types = np.unique(QUERY.index.get_level_values(meta))
# To evaluate patient mixing within each cell type, we:
# (1) randomly sampling ncells per cell type
# (2) Compute the Shannon Diversity index based on sample ID within the sampled subset.
# (3) Repeat nreps times to achieve a range of entropy values per cell type.
ncells = 100 # randomly sample this number of cells within each cell type
nreps = 100 # repeat sampling this many times
# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# ENTROPY FOR CELL SUBSETS
H = pd.DataFrame()
for cell_type in cell_types:
# SUBSET CELLS BELONGING TO SPECIFIC CELL TYPE
CELL_TYPE_SUBSET = QUERY.select(lambda x: x[[ind for ind, name in enumerate(QUERY.index.names) if name==meta][0]] in [cell_type])
H_sample = []
for rep in np.arange(nreps):
# Randomly sample 100 Cells from cluster
sampling = CELL_TYPE_SUBSET.sample(n=ncells,replace=True)
# Compute probability of cells sampled for individual patients
labels = sampling.index.get_level_values('Legend')
probs = [np.mean(labels == c) for c in set(labels)]
# Compute entropy of distribution across patients
H_sample.append(np.sum(-p * np.log(p) for p in probs))
# Results for cell type
H = H.append(pd.DataFrame({'Entropy across patients': H_sample, 'Cell Type':[cell_type]*nreps}))
# VISUALIZE PATIENT ENTROPY BY RANKED BOXPLOT
fig = plt.figure(figsize = (8,3))
ax1 = plt.gca()
sns.set_style("white")
# Rank boxplots by entropy
plot_order = H.set_index('Cell Type').groupby('Cell Type').mean().sort_values(by=['Entropy across patients']).index
g = sns.boxplot(x="Cell Type", y="Entropy across patients",data=H, palette=palette,
fliersize = 4, showmeans=False,linewidth = 1, order = plot_order)
g.set_ylabel("Entropy across patients",fontsize=10,fontname = 'Arial')
g.set_xticklabels(plot_order,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=10)
sns.despine()
# SAVE FIGURE
figure_label = '{}_PATIENT_ENTROPY_PER_{}'.format(subset_type,meta).replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
In this section, we analyze at greater resolution the epithelial compartment to better understand the relationship between normal lung epithelial cell types and cancer; as well as tumor-cell intrinsic adaptations.
subset_type = 'EPITHELIAL_NOR'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
dimtype = 'ForceDirected' # we now use force directed layout for visualization, because anlayzing cell state heterogneiety within a given cell type
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.
dot_size = 5
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)
# Sample ID
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'),
cmap=CM_NORMAL,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()
# Phenograph Cluster
ax2 = plt.subplot(gs1[1])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('PHENOGRAPH_CLASS'),
cmap= CM_PHENOGRAPH_CLASS_NORMAL,
legend_kwargs={'ncol': 3, 'borderaxespad':0.,
'bbox_to_anchor':(0.75, 0)},
s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')
# Cluster Assignment
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'),
cmap= CM_CLASS_NORMAL_LINEAGES,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')
# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CLASS_NORMAL_LINEAGES
# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['AE2','AE1','CLUB','CILIATED']
marker_genes_dict = {
'AEC2': ['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'],
'AEC1': ['PDPN','HOPX','AGER','CAV1','SEMA3B'],
'CLUB': ['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'],
'CILIATED':['SOX2','FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'],
}
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['AEC2','AEC1','CLUB','CILIATED']
# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
genes = marker_genes_dict[gene_group]
genes = [gene for gene in genes if gene in QUERY.columns]
labels = [gene_group] * len(genes)
current = pd.DataFrame({'labels':labels,'genes':genes})
tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values
var_group_positions = []
for gene_group in var_group_labels:
cpositions = list(tmp[tmp['labels'] ==gene_group].index)
var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))
# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
save=None)
# COLORMAP LUT BASED ON QUERY & MET
ind = categories
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CLASS_NORMAL_LINEAGES
# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))
# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)
# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
yticks = heatmap_data.index
xticks = heatmap_data.columns
# Palatte for Class METADATA
ind = list(heatmap_data.index) # CLUSTERS
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# LINKAGE
method = 'average' # average, single
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(15,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[list(mat.index)]:
pos = (x, y / len(mat.index))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
if y >= len(r1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)
# SAVE FIGURE
figure_label = '_{}_matrixPlotMedian_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SELECT TOP <top_cutoff> DE GENES BY CELL TYPE
top_cutoff = 60
meta = 'PHENOGRAPH_CLASS'
exec('class_names = [name for name in np.unique(GENE_RANK_MAST_{}.columns) if \'{}\' in name]'.format(subset_type,meta))
gene_array = pd.DataFrame()
for title in class_names:
exec('tmp = GENE_RANK_MAST_{}[title]'.format(subset_type))
tmp = tmp.sort_values(axis=0, ascending=False)
gene_subset = list(tmp.index)
strs = [title for x in range(len(gene_subset))]
cdf = pd.DataFrame({'Gene':gene_subset[:top_cutoff], meta: strs[:top_cutoff]})
gene_array = pd.concat([gene_array,cdf],axis=0)
# COLOR ROWS BY ANNOTATED CELL TYPE
exec('ind = INDF_{}.index.get_level_values(\'CELL_TYPE\')'.format(subset_type))
FLATUI_PLOT = FLATUI_PHENOGRAPH_CLASS_NORMAL
# Palatte for Class METADATA
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind).map(lut)
# COLUMN INDEX AND COLORS
genes = np.unique(gene_array['Gene'].values)
# CONSTRUCT HEATMAP DATA
exec('heatmap_data = pd.DataFrame(data = zscore(INDF_{}[genes].values,axis=0),columns = genes)'.format(subset_type))
yticks = heatmap_data.index
xticks = heatmap_data.columns
# LINKAGE
method = 'average' # average, single
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# CLUSTERED HEATMAP OF DE GENES BY SUBPOPULATION,Only showing DE
sns.set(font_scale=0.5)
g = sns.clustermap(heatmap_data,
row_linkage = row_linkage, col_linkage=col_linkage,
figsize = (20,18),linewidth=0,row_colors = row_colors,
cmap = plt.cm.RdBu_r,vmin = -3, vmax = 3)
ax = g.ax_heatmap
ax.set_yticklabels([]);
# Only label marker genes for simplified visual
labels = [item.get_text() for item in ax.get_xticklabels()]
for ind,label in enumerate(labels):
if label not in marker_genes:
labels[ind] = ''
h = ax.set_xticklabels(labels, rotation=90, fontsize=20);
# SAVE FIGURE
figure_label = '_{}_{}_HEATMAP_TOP{}_DE_GENES_{}_{}_INDF'.format(meta,subset_type,top_cutoff,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
g.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CLASS_NORMAL_LINEAGES
exec('QUERY = INDF_{}'.format(subset_type))
plot_order = ['AE2','AE1','CLUB','CILIATED']
genes = marker_genes
# COLOR ROWS BY <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0),
'gene':violin_data.index.get_level_values(1),
'Expression':violin_data.values})
# (0,0) Plot violin plot
N = len(plot_order)
# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (3,N*1))
sns.set_style("white")
for iii,cell_type in enumerate(plot_order):
ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type],
inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1,
scale = 'count',color = palette[cell_type],ax = ax1)
g.set_ylabel(" ")
g.set_xlabel(" ")
g.tick_params(axis='y',labelsize=8, rotation = 0)
plt.ylim(0,5)
plt.title("{}".format(cell_type.title()),fontsize=8,fontname = 'Arial',loc = 'left',
rotation = 'horizontal', ha = 'left', pad = 1.5)
ax1.spines['top'].set_linewidth(0.5)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_linewidth(0.5)
ax1.spines['bottom'].set_linewidth(0.5)
ax1.spines['left'].set_linewidth(0.5)
ax1.spines['right'].set_visible(False)
if iii != (N-1):
ax1.tick_params(axis='x', bottom=False, labelbottom=False)
else:
ax1.tick_params(axis='x', labelsize=8, rotation = 90)
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
meta = 'CELL_TYPE'
cm_flatui = FLATUI_CLASS_NORMAL_LINEAGES
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
exec('tissue_cluster_sizes = QUERY.groupby(level=[\'Legend\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(meta))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]
# BARPLOT SHOWING FRACTION CELL TYPE DETECTED PER PATIENT SAMPLE
plt.figure(figsize = (6,2))
ax = plt.gca()
legend_kwargs={'ncol': 2, 'borderaxespad':0.,'bbox_to_anchor':(1, 0)},
r1 = ['RU675_NORMAL_1ANS_UTR','RU682_NORMAL_2AS_UTR','RU684_NORMAL_1AS_UTR','RU685_NORMAL_1ANS_UTR'][::-1]
tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0).ix[r1].plot.barh(stacked=True,
color=colors,
ax = ax, width = 0.95)
plt.xticks(fontsize = 8)
plt.yticks(fontsize = 8)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.legend_.remove()
plt.tight_layout()
plt.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=False, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=False) # labels along the bottom edge are off
plt.tick_params(
axis='y', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
left=False, # ticks along the bottom edge are off
right=False, # ticks along the top edge are off
labelleft=True) # labels along the bottom edge are off
# SAVE FIGURE
figure_label = '_{}_{}perSample_barplot'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=fig_dpi, transparent=True)
print(fn)
subset_type = 'EPITHELIAL_NOR_TUMOR'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
dimtype = 'ForceDirected'
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.
dot_size = 5
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)
# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'),
cmap=CM_CANCERandNORMAL,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()
# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('PHENOGRAPH_CLASS'),
cmap= CM_CLASS,
legend_kwargs={'ncol': 3, 'borderaxespad':0.,
'bbox_to_anchor':(0.75, 0)},
s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')
# CELL TYPE ASSIGNMENTS (STEM/PROGENITOR IS THE MIXED LINEAGE STATE)
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'),
cmap= CM_LINEAGE,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')
# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# LOAD GENESETS
# ADD REFERENCE TO SUPPLEMENTARY FILE
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
genesets_include = ['AEP_MORRISEY_2FC','AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS']
lineage_genes = np.unique(genesets[genesets_include].values.ravel().tolist())
lut = {}
for geneset in genesets_include:
current_genes = [gene for gene in np.unique(genesets[geneset].values.ravel().tolist()) if gene != 'nan']
current_label = [geneset]*len(current_genes)
current_lut = dict(zip(current_genes,current_label))
lut = dict(lut, **current_lut)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_LINEAGE
# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR']
marker_genes_dict = {
'AEC2': ['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'],
'AEC1': ['PDPN','HOPX','AGER','CAV1','SEMA3B'],
'CLUB': ['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'],
'CILIATED':['FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'],
'BASAL':['KRT14', 'APOE','EMP3','APOC1','CD44'],
'NE': ['UCHL1','ASCL1','CALCA'],
'AEP': ['TM4SF1','MUC5B','LRP5'],
'PROGENITOR TF': ['SOX2','SOX9','ID2','NKX2-1']
}
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['AEC2','AEC1','CLUB','CILIATED','BASAL','NE','AEP','PROGENITOR TF']
# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
genes = marker_genes_dict[gene_group]
genes = [gene for gene in genes if gene in QUERY.columns]
labels = [gene_group] * len(genes)
current = pd.DataFrame({'labels':labels,'genes':genes})
tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values
var_group_positions = []
for gene_group in var_group_labels:
cpositions = list(tmp[tmp['labels'] ==gene_group].index)
var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))
# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
save=None)
# COLORMAP LUT BASED ON QUERY & MET
ind = categories
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_LINEAGE
exec('QUERY = INDF_{}'.format(subset_type))
plot_order = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR']
genes = marker_genes
# COLOR ROWS BY <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0),
'gene':violin_data.index.get_level_values(1),
'Expression':violin_data.values})
# (0,0) Plot violin plot
N = len(plot_order)
# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (4,N*0.80))
sns.set_style("white")
for iii,cell_type in enumerate(plot_order):
ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type],
inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1,
scale = 'count',color = palette[cell_type],ax = ax1)
g.set_ylabel(" ")
g.set_xlabel(" ")
g.tick_params(axis='y',labelsize=8, rotation = 0)
plt.ylim(0,10)
plt.title("{}".format(cell_type.title()),fontsize=6,fontname = 'Arial',loc = 'left',
rotation = 'horizontal', ha = 'left', pad = 1.5)
ax1.spines['top'].set_linewidth(0.5)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_linewidth(0.5)
ax1.spines['bottom'].set_linewidth(0.5)
ax1.spines['left'].set_linewidth(0.5)
ax1.spines['right'].set_visible(False)
if iii != (N-1):
ax1.tick_params(axis='x', bottom=False, labelbottom=False)
else:
ax1.tick_params(axis='x', labelsize=8, rotation = 90)
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
gene_name = 'TM4SF1'
exec('QUERY = INDF_{}'.format(subset_type))
# Color rows by <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0),
'gene':violin_data.index.get_level_values(1),
'Expression':violin_data.values})
gene_data = violin_data[violin_data['gene']==gene_name]
row_order = list(gene_data.groupby(meta).median().sort_values(by=['Expression']).index)
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(gene_data, row=meta,
hue = meta,aspect=10, size = 0.30,row_order = row_order,
palette=palette,col_wrap=None, sharex=True, sharey=True)
# Draw the densities in a few steps
g.map(sns.kdeplot, "Expression", clip_on=False, shade=True, alpha=1, lw=1, bw=.2, kernel = 'gau')
g.map(sns.kdeplot, "Expression", clip_on=False, color="w", lw=1, bw=.2, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
ax = plt.gca()
ax.text(0, 0.1, label,color=color,fontsize=8,
ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Expression")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.75)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)
plt.xlabel('KDE({} expression)'.format(gene_name.title()),fontsize=10)
# SAVE FIGURE
figure_label = '_{}_stackedKDE_{}_by{}'.format(subset_type,gene_name,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
meta = 'CELL_TYPE'
cm_flatui = FLATUI_LINEAGE
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
exec('tissue_cluster_sizes = QUERY.groupby(level=[\'Legend\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(meta))
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]
# BARPLOT SHOWING FRACTION CELL TYPE DETECTED PER PATIENT SAMPLE
plt.figure(figsize = (3,2))
ax = plt.gca()
legend_kwargs={'ncol': 2, 'borderaxespad':0.,'bbox_to_anchor':(1, 0)},
r1 = ["RU675_NORMAL_1ANS_UTR", "RU682_NORMAL_2AS_UTR", "RU684_NORMAL_1AS_UTR","RU685_NORMAL_1ANS_UTR",
"RU653_TUMOR_1AS_UTR","RU661_TUMOR_1AS_UTR","RU675_TUMOR_1ANS_UTR","RU676_TUMOR_1AS_UTR",
"RU679_TUMOR_2AS_TR","RU680_TUMOR_1AS_UTR","RU682_TUMOR_2AS_UTR","RU684_TUMOR_1AS_UTR"][::-1]
tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0).ix[r1].plot.barh(stacked=True,
color=colors,
ax = ax,
width = 0.95)
plt.xticks(fontsize = 8)
plt.yticks(fontsize = 8)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.legend_.remove()
plt.tight_layout()
plt.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=False, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=False) # labels along the bottom edge are off
plt.tick_params(
axis='y', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
left=False, # ticks along the bottom edge are off
right=False, # ticks along the top edge are off
labelleft=True) # labels along the bottom edge are off
# SAVE FIGURE
figure_label = '_{}_{}perSample_barplot'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=fig_dpi, transparent=True)
print(fn)
# RANGE OF CLUSTER SIZES
meta = 'CELL_TYPE'
FLATUI_PLOT = FLATUI_LINEAGE
exec('QUERY = INDF_{}'.format(subset_type))
cell_types = np.unique(QUERY.index.get_level_values(meta))
# To evaluate patient mixing within each cell type, we:
# (1) randomly sampling ncells per cell type
# (2) Compute the Shannon Diversity index based on sample ID within the sampled subset.
# (3) Repeat nreps times to achieve a range of entropy values per cell type.
ncells = 100 # randomly sample this number of cells within each cell type
nreps = 20 # repeat sampling this many times, scaled to cover sample size.
# GENERATE COLORMAP
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# ENTROPY FOR CELL SUBSETS
H = pd.DataFrame()
for cell_type in cell_types:
# SUBSET CELLS BELONGING TO SPECIFIC CELL TYPE
CELL_TYPE_SUBSET = QUERY.select(lambda x: x[[ind for ind, name in enumerate(QUERY.index.names) if name==meta][0]] in [cell_type])
H_sample = []
for rep in np.arange(nreps):
# Randomly sample 100 Cells from cluster
sampling = CELL_TYPE_SUBSET.sample(n=ncells,replace=True)
# Compute probability of cells sampled for individual patients
labels = sampling.index.get_level_values('Legend')
probs = [np.mean(labels == c) for c in set(labels)]
# Compute entropy of distribution across patients
H_sample.append(np.sum(-p * np.log(p) for p in probs))
# Results for cell type
H = H.append(pd.DataFrame({'Entropy across patients': H_sample, 'Cell Type':[cell_type]*nreps}))
# RANKED KDE OF ENTROPY TO ACCOMPANY HEATMAP
#### VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(0.6,5))
H_row_order = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR']
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
# Initialize the FacetGrid object
g = sns.FacetGrid(H, row='Cell Type', hue = 'Cell Type',aspect=3, size = 0.30,row_order = H_row_order,
palette=palette,col_wrap=None, sharex=True, sharey=True)
# Draw the densities in a few steps
g.map(sns.kdeplot, "Entropy across patients", clip_on=False, shade=True, alpha=1, lw=1, bw=.1, kernel = 'gau')
g.map(sns.kdeplot, "Entropy across patients", clip_on=False, color="w", lw=1, bw=.1, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
ax = plt.gca()
ax.text(0, 0.1, label,color=color,fontsize=8,
ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Entropy across patients")
# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-0.25)
# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)
plt.xlabel('KDE(Entropy across patients)',fontsize=10)
# SAVE FIGURE
figure_label = '{}_PATIENT_ENTROPY_PER_{}_STACKED_VIOLINS'.format(subset_type,meta).replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
tissue_cluster_sizes = QUERY.groupby(level=['CELL_TYPE', 'Meta-Source'], axis=0).size().unstack().fillna(0)
# CONVERT HEX TO RGB (FLATUI_CLASS)
FLATUI_PLOT = ['E6E6FA', # NORMAL
'000000'] # PRIMARY
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]
plt.figure(figsize = (2,1))
ax = plt.gca()
D = tissue_cluster_sizes.reindex(['AEC2','AEC1', 'CLUB', 'CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR'])
D.div(D.sum(axis=1),axis=0).plot.bar(stacked=True, color=metacell_colors, ax = ax, width = 0.85)
ax.legend_.remove()
plt.axis('off')
# SAVE FIGURE
figure_label = '_{}Distribution_PerSource_{}_PathwayMerged_fraction'.format(meta,subset_type).replace('.','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# COMPUTED ON UN-IMPUTED, NORMALIZED DATA
# COLOR BY CELL_TYPE, SOURCE AND PHENOGRAPH CLASS
exec('ind = NDF_{}.index.get_level_values(\'CELL_TYPE\')'.format(subset_type))
exec('ind2 = NDF_{}.index.get_level_values(\'Meta-Source\')'.format(subset_type))
exec('ind3 = NDF_{}.index.get_level_values(\'PHENOGRAPH_CLASS\')'.format(subset_type))
# Palatte for Class METADATA
FLATUI_PLOT = FLATUI_LINEAGE
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors1 = pd.Series(ind).map(lut)
# Palatte for Class METADATA
FLATUI_SOURCE_RX = [
'E6E6FA', # NORMAL
'000000', # PRIMARY
]
FLATUI_PLOT = FLATUI_SOURCE_RX
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind2)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind2), colors[cix,:]))
row_colors2 = pd.Series(ind2).map(lut)
FLATUI_PLOT = FLATUI_CLASS
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind3)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind3), colors[cix,:]))
row_colors3 = pd.Series(ind3).map(lut)
# COLUMN INDEX AND COLORS
exec('genes = [gene for gene in marker_genes if gene in NDF_{}.columns]'.format(subset_type))
# CONSTRUCT HEATMAP DATA
exec('heatmap_data = pd.DataFrame(data = NDF_{}[genes].values,columns = genes, index = NDF_{}.index).fillna(0)'.format(subset_type,subset_type))
binary_heatmap_data = heatmap_data>heatmap_data.quantile(q = 0.75,axis=0)
binary_heatmap_data = binary_heatmap_data.astype(int)
method = 'average' # average, single
metric = 'euclidean' # cosine
linkage = hc.linkage(binary_heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(binary_heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# CLUSTERED HEATMAP OF DE GENES BY SUBPOPULATION,Only showing DE
sns.set(font_scale=0.5)
g = sns.clustermap(binary_heatmap_data,
row_linkage = row_linkage,col_cluster=False,
linewidths=0.1,
figsize = (3,8),linewidth=0,row_colors=[row_colors3,row_colors2, row_colors1],
cmap = 'coolwarm')
ax = g.ax_heatmap
ax.set_yticklabels([]);
g.cax.set_visible(False)
h = ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=6);
# SAVE FIGURE
figure_label = '_{}_single_cell_lineageGeneBinary_heatmap_cellType_source_phenograph'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
g.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
genesets_include = ['AEP_MORRISEY_2FC','AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS']
lineage_genes = genesets[genesets_include]
# CONSTRUCT HEATMAP DATA
exec('heatmap_data = NDF_{}.fillna(0)'.format(subset_type))
binary_heatmap_data = heatmap_data>heatmap_data.quantile(q = 0.75,axis=0)
binary_heatmap_data = binary_heatmap_data.astype(int)
#AEC2_markers = [gene for gene in lineage_genes['AEC2_ME18p5_GENES_LG'].values if gene not in [nan]]#['SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3']
AEC2_markers = set(['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'])
AEC2_markers = set([gene for gene in AEC2_markers if gene in heatmap_data.columns])
#AEC1_markers = [gene for gene in lineage_genes['AEC1_ME18p5_GENES_LG'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
AEC1_markers = set(['PDPN','HOPX','AGER','CAV1','SEMA3B'])
AEC1_markers = set([gene for gene in AEC1_markers if gene in heatmap_data.columns])
AEC1_markers = AEC1_markers-AEC2_markers
#club_markers = [gene for gene in lineage_genes['CLUB_ME18p5_GENES_LG'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
club_markers = set(['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'])
club_markers = set([gene for gene in club_markers if gene in heatmap_data.columns])
club_markers = club_markers-AEC1_markers-AEC2_markers
#ciliated_markers = [gene for gene in lineage_genes['CILIATED_ME18p5_GENES_LG'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
ciliated_markers = set(['FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'])#
ciliated_markers = set([gene for gene in ciliated_markers if gene in heatmap_data.columns])
ciliated_markers = ciliated_markers-club_markers-AEC1_markers-AEC2_markers
#basal_markers = [gene for gene in lineage_genes['LECTIN+KRT5+BASAL'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
basal_markers = set(['KRT14', 'KRT31', 'APOE','EMP3','APOC1','ARHGD1B','CD44'])
basal_markers = set([gene for gene in basal_markers if gene in heatmap_data.columns])
basal_markers = basal_markers-ciliated_markers-club_markers-AEC1_markers-AEC2_markers
#PNEC_markers = [gene for gene in lineage_genes['PNECS'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
PNEC_markers = set(['UCHL1','ASCL1','CALCA'])
PNEC_markers = set([gene for gene in PNEC_markers if gene in heatmap_data.columns])
PNEC_markers = PNEC_markers-basal_markers-ciliated_markers-club_markers-AEC1_markers-AEC2_markers
#AEP_markers = [gene for gene in lineage_genes['AEP_MORRISEY_2FC'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
AEP_markers = set(['TM4SF1','MUC5B','LRP5'])
AEP_markers = set([gene for gene in AEP_markers if gene in heatmap_data.columns])
AEP_markers = AEP_markers-PNEC_markers-basal_markers-ciliated_markers-club_markers-AEC1_markers-AEC2_markers
all_markers = list(AEC2_markers)+list(AEC1_markers)+list(club_markers)+list(ciliated_markers)+list(basal_markers)+list(PNEC_markers)+list(AEP_markers)
binary_heatmap_data = binary_heatmap_data[all_markers]
a1 = {key : 'AEC1' for key in AEC1_markers}
a2 = {key : 'AEC2' for key in AEC2_markers}
a3 = {key : 'CLUB' for key in club_markers}
a4 = {key : 'CILIATED' for key in ciliated_markers}
a5 = {key : 'BASAL' for key in basal_markers}
a6 = {key : 'NE' for key in PNEC_markers}
a7 = {key : 'AEP' for key in AEP_markers}
inverted_marker_genes_dict = {**a1, **a2,**a3,**a4,**a5,**a6,**a7}
print(np.unique(binary_heatmap_data.index.get_level_values('CELL_TYPE')))
lineage_color_lut = {'AEC1':'#FF66FF',
'AEC2':'#6600CC',
'AEP':'#9ACD32',
'BASAL':'#0000FF',
'CILIATED':'#3399FF',
'CLUB':'#FF8800',
'NE':'#FFD700',
'PROGENITOR TF':'#E6E6FA'}
# GENERATE AXIS PER VIOLIN PLOT
fraction_cell_type_array = pd.DataFrame()
for cell_index in np.arange(binary_heatmap_data.shape[0]):
current_cell = binary_heatmap_data.ix[cell_index,:]
detected_genes = current_cell[current_cell==1].index
detected_cell_types = list(detected_genes.map(inverted_marker_genes_dict))
fraction_cell_type = pd.DataFrame(columns = ['Cell Type', 'Gene Counts'])
for cell_type in np.unique(detected_cell_types):
fraction_cell_type = fraction_cell_type.append({'Cell Type' :cell_type, 'Gene Counts': detected_cell_types.count(cell_type)}, ignore_index = True)
fraction_cell_type = fraction_cell_type.set_index('Cell Type').T
fraction_cell_type =fraction_cell_type.div(fraction_cell_type.sum(axis=1),axis=0)
all_cell_types = ['AEC1','AEC2','AEP','BASAL','CILIATED','CLUB','NE']
missing_cell_types = [cell_type for cell_type in all_cell_types if cell_type not in list(fraction_cell_type.columns)]
for missing_type in missing_cell_types:
fraction_cell_type[missing_type]=0
fraction_cell_type = fraction_cell_type.sort_index(axis=1)
fraction_cell_type_array = fraction_cell_type_array.append(fraction_cell_type)
fraction_cell_type_array.index = binary_heatmap_data.index
color_lut = {'AEC1':'#FF66FF',
'AEC2':'#6600CC',
'AEP/MUCINOUS':'#9ACD32',
'KRT5+BASAL':'#0000FF',
'CILIATED':'#3399FF',
'CLUB':'#FF8800',
'PNEC':'#FFD700',
'STEM/PROGENITOR':'#E6E6FA'}
# XY Parameters
plt.figure(figsize = (5,5))
gs1 = gridspec.GridSpec(1,1)
gs1.update(wspace=0.0, hspace=0.05) # set the spacing between axes.
sns.set_style("white")
xaxis_label = 'AEC1'
yaxis_label = 'AEC2'
x = fraction_cell_type_array[xaxis_label]
y = fraction_cell_type_array[yaxis_label]
cell_type = fraction_cell_type_array.index.get_level_values('CELL_TYPE')
phenograph_class = fraction_cell_type_array.index.get_level_values('PHENOGRAPH_CLASS')
source = fraction_cell_type_array.index.get_level_values('Meta-Source')
c = fraction_cell_type_array.index.get_level_values('CELL_TYPE').map(color_lut)
ax1 = plt.subplot(gs1[0])
plot_cell= "STEM/PROGENITOR"
pcs = [0,1] # SELECTING CELLS FROM PHENOGRAPH CLASS 0 AND 1, WHICH SHOW AEC1-AEC2 MIXING
ax = sns.kdeplot(x[(cell_type==plot_cell)*(source=='TUMOR')*([pc in pcs for pc in phenograph_class])],
y[(cell_type==plot_cell)*(source=='TUMOR')*([pc in pcs for pc in phenograph_class])],
n_levels=10, shade=True, cmap = 'Greys',shade_lowest=False, alpha = 1,
)
for path in ax.collections[-7].get_paths():
coords = path.vertices
ax.plot(coords[:,0],coords[:,1],color_lut[plot_cell], lw = 1.2)
plt.xlim((-0.1,1))
plt.ylim((-0.1,1))
plot_cell= yaxis_label
ax = sns.kdeplot(x[(cell_type==plot_cell)*(source=='NOR')],y[(cell_type==plot_cell)*(source=='NOR')],
n_levels=10, shade=True, cmap = 'Purples',shade_lowest=False, alpha = 0.5,
)
for path in ax.collections[-7].get_paths():
coords = path.vertices
ax.plot(coords[:,0],coords[:,1],color_lut[plot_cell], lw = 1.2)
plt.xlim((-0.1,1))
plt.ylim((-0.1,1))
plot_cell= xaxis_label
ax = sns.kdeplot(x[(cell_type==plot_cell)*(source=='NOR')],y[(cell_type==plot_cell)*(source=='NOR')],
n_levels=10, shade=True, cmap = 'PuRd',shade_lowest=False,alpha = 0.5,
)
for path in ax.collections[-7].get_paths():
coords = path.vertices
ax.plot(coords[:,0],coords[:,1],color_lut[plot_cell], lw = 1.2)
#plt.gca().axes.get_xaxis().set_visible(False)
plt.xlim((-0.1,1))
plt.ylim((-0.1,1))
ax.spines['top'].set_linewidth(0.5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_linewidth(0.5)
ax.spines['bottom'].set_linewidth(0.5)
ax.spines['left'].set_linewidth(0.5)
ax.spines['right'].set_visible(False)
#ax1 = plt.subplot(gs1[1])
# SAVE FIGURE
figure_label = '_{}_lineageMarkersGTQ75_fraction_PC01_x{}_y{}_overlaid'.format(subset_type,xaxis_label,yaxis_label)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
exec('QUERY = NDF_{}'.format(subset_type))
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>biological_variance.mean()
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
goi = list(retain_bool[retain_bool].index)
print(len(goi))
# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# LOAD LINEAGE-SPECIFIC GENE SETS
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
genesets_include = ['AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS','AEP_MORRISEY_2FC']
lineage_genes = np.unique(genesets[genesets_include].values.ravel().tolist())
lineage_genes = [gene for gene in lineage_genes if gene in list(QUERY.columns)]
variable_lineage_genes = [gene for gene in lineage_genes if gene in goi]
print(len(lineage_genes))
print(len(variable_lineage_genes))
# COMPUTE LINEAGE PHENOTYPIC VOLUME
n = 500
reps = np.arange(50) # number of times to downsample cells
gene_subset = variable_lineage_genes # genes used in covariance calculation
print(len(gene_subset))
# GENE-GENE COVARIANCE WITHIN NORMAL SUBSET
master_groupby = 'Meta-Source'
class_selection = ['NOR']
ix = [ind for ind,name in enumerate(QUERY.index.names) if name==master_groupby][0]
logvolume_normal = np.zeros(len(reps))
for rep in reps:
SUBSET = QUERY.select(lambda x: x[ix] in class_selection)[gene_subset].sample(n=n,replace=True)
GENE_COV = SUBSET.cov(min_periods=None)
eigenvalues = np.real(np.linalg.eigvals(GENE_COV))
logvolume = np.float(np.sum(0.5*np.log10(eigenvalues[eigenvalues>0]**2))/len(eigenvalues))
logvolume_normal[rep] = logvolume
# GENE-GENE COVARIANCE WITHIN TUMOR SUBSET
master_groupby = 'Meta-Source'
class_selection = ['TUMOR']
ix = [ind for ind,name in enumerate(QUERY.index.names) if name==master_groupby][0]
logvolume_tumor = np.zeros(len(reps))
for rep in reps:
SUBSET = QUERY.select(lambda x: x[ix] in class_selection)[gene_subset].sample(n=n,replace=True)
GENE_COV = SUBSET.cov(min_periods=None)
eigenvalues = np.real(np.linalg.eigvals(GENE_COV))
logvolume = np.float(np.sum(0.5*np.log10(eigenvalues[eigenvalues>0]**2))/len(eigenvalues))
logvolume_tumor[rep] = logvolume
# WRITE RESULTS TO DATAFRAME
boxplot_data_variable_lineage = pd.DataFrame()
boxplot_data_variable_lineage['Meta-Source'] = ['NOR']*len(logvolume_normal)+['TUMOR']*len(logvolume_tumor)
boxplot_data_variable_lineage['Log Phenotypic Volume'] = np.concatenate([logvolume_normal,logvolume_tumor],axis = 0)
# VIOLIN PLOT
fig = plt.figure(figsize = (2,8))
ax = plt.gca()
palette = dict(zip(['TUMOR','NOR'],[[0,0,0],[230/255,230/255,250/255]]))
# Plot violin plot
g = sns.boxplot(x='Meta-Source', y="Log Phenotypic Volume" ,
data=boxplot_data_variable_lineage,palette = palette,notch = True,
linewidth = 1, ax = ax, order = ['NOR','TUMOR']);
g.set_ylabel("Log Lineage Phenotypic Volume",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()
ax.spines['bottom'].set_linewidth(1)
ax.spines['left'].set_linewidth(1)
# STATISTIC
T = boxplot_data_variable_lineage.loc[boxplot_data_variable_lineage['Meta-Source'].isin(['TUMOR'])]['Log Phenotypic Volume'].values
N = boxplot_data_variable_lineage.loc[boxplot_data_variable_lineage['Meta-Source'].isin(['NOR'])]['Log Phenotypic Volume'].values
print(stats.mannwhitneyu(T, N))
# SAVE FIGURE
figure_label = '{}_LogVariableLineagePhenotypicVolume_n{}_reps{}'.format(subset_type,n,len(reps)).replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
dimtype = 'ForceDirected'
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# SOURCE WITH TREATMENT INFO
Rx = [name[len(name)-3:].replace('_','').replace('T4','TR').replace('UTR','').replace('TR','Tx') for name in QUERY.index.get_level_values('Legend')]
Source = QUERY.index.get_level_values('Meta-Source')
Source_Rx = Source+Rx
# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.01, hspace=0.1) # set the spacing between axes.
dot_size = 5
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)
# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=Source_Rx,
cmap=CM_SOURCE_RX,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()
# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('PHENOGRAPH_CLASS'),
cmap= CM_CLASS_ALL_EP,
legend_kwargs={'ncol': 3, 'borderaxespad':0.,
'bbox_to_anchor':(0.75, 0)},
s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')
# CELL TYPES ANNOTATED PREVIOUSLY IN MERGED NORMAL AND PRIMARY TUMOR EPITHELIUM
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'),
cmap= CM_LINEAGE,
legend_kwargs={'ncol': 1, 'borderaxespad':0.,
'bbox_to_anchor':(1, 0)},
s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')
# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# LOAD GENESETS
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
genesets_include = ['AEP_MORRISEY_2FC','AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS']
lineage_genes = np.unique(genesets[genesets_include].values.ravel().tolist())
lut = {}
for geneset in genesets_include:
current_genes = [gene for gene in np.unique(genesets[geneset].values.ravel().tolist()) if gene != 'nan']
current_label = [geneset]*len(current_genes)
current_lut = dict(zip(current_genes,current_label))
lut = dict(lut, **current_lut)
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_LINEAGE
# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR','UNKNOWN']
marker_genes_dict = {
'AEC2': ['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'],
'AEC1': ['PDPN','HOPX','AGER','CAV1','SEMA3B'],
'CLUB': ['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'],
'CILIATED':['FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'],
'BASAL':['KRT14', 'APOE','EMP3','APOC1','CD44'],
'NE': ['UCHL1','ASCL1','CALCA'],
'AEP': ['TM4SF1','MUC5B','LRP5'],
'PROGENITOR TF': ['SOX2','SOX9','ID2','NKX2-1']
}
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['AEC2','AEC1','CLUB','CILIATED','BASAL','NE','AEP','PROGENITOR TF']
# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
genes = marker_genes_dict[gene_group]
genes = [gene for gene in genes if gene in QUERY.columns]
labels = [gene_group] * len(genes)
current = pd.DataFrame({'labels':labels,'genes':genes})
tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values
var_group_positions = []
for gene_group in var_group_labels:
cpositions = list(tmp[tmp['labels'] ==gene_group].index)
var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))
# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
save=None)
# COLORMAP LUT BASED ON QUERY & MET
ind = categories #QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)
# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# Patient mixing within each cluster
meta = 'PHENOGRAPH_CLASS'
exec('QUERY = INDF_{}'.format(subset_type))
FLATUI_PLOT = FLATUI_CLASS_ALL_EP
cell_types = np.unique(QUERY.index.get_level_values(meta))
# To evaluate patient mixing within each cell type, we:
# (1) randomly sampling ncells per cell type
# (2) Compute the Shannon Diversity index based on sample ID within the sampled subset.
# (3) Repeat nreps times to achieve a range of entropy values per cell type.
ncells = 100 # randomly sample this number of cells within each cell type
nreps = 100 # repeat sampling this many times, scaled to cover sample size.
# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
# ENTROPY FOR CELL SUBSETS
H = pd.DataFrame()
for cell_type in cell_types:
# SUBSET CELLS BELONGING TO SPECIFIC CELL TYPE
CELL_TYPE_SUBSET = QUERY.select(lambda x: x[[ind for ind, name in enumerate(QUERY.index.names) if name==meta][0]] in [cell_type])
H_sample = []
for rep in np.arange(nreps):
# Randomly sample 100 Cells from cluster
sampling = CELL_TYPE_SUBSET.sample(n=ncells,replace=True)
# Compute probability of cells sampled for individual patients
labels = sampling.index.get_level_values('Legend')
probs = [np.mean(labels == c) for c in set(labels)]
# Compute entropy of distribution across patients
H_sample.append(np.sum(-p * np.log(p) for p in probs))
# Results for cell type
H = H.append(pd.DataFrame({'Entropy across patients': H_sample, 'Cell Type':[cell_type]*nreps}))
# GROUPED BOXPLOT WITH DATA POINTS OVERLAYED
fig = plt.figure(figsize = (8,3))
ax1 = plt.gca()
sns.set_style("white")
# Rank boxplots by entropy
plot_order = H.set_index('Cell Type').groupby('Cell Type').mean().sort_values(by=['Entropy across patients']).index
g = sns.boxplot(x="Cell Type", y="Entropy across patients",data=H, palette=palette,
fliersize = 4, showmeans=False,linewidth = 1, order = plot_order)
g.set_ylabel("Entropy across patients",fontsize=10,fontname = 'Arial')
g.set_xticklabels(plot_order,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=10)
sns.despine()
# SAVE FIGURE
figure_label = '{}_PATIENT_ENTROPY_PER_{}'.format(subset_type,meta).replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# Fraction detected per metastasis
meta = 'Meta-Source'
exec('tmp = INDF_{}.groupby(level=[\'PHENOGRAPH_CLASS\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(subset_type,meta))
tmp = tmp.div(tmp.sum(axis=1),axis=0)
cluster_ind = list(tmp[tmp['MET']> 0.10].index)
print(cluster_ind)
# LOAD KEY GENESETS
filename = 'stem_cell_signatures_merged'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
# STACKED RANKED BOXPLOTS
meta = 'PHENOGRAPH_CLASS'
FLATUI_PLOT = FLATUI_CLASS_ALL_EP
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in set(QUERY.columns)]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)
violin_data = []
for ind,v in enumerate(SCORE):
violin_data.append({'gene': title, 'Z-normalized Expression': v,
meta:QUERY.index.get_level_values(meta)[ind]})
violin_data = pd.DataFrame(violin_data)
order = list(violin_data[violin_data['gene']==title].groupby(meta).median().sort_values('Z-normalized Expression',ascending = False).index)
met_order = [val for val in order if val in cluster_ind][::-1]
# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))
plot_order = ['GO_LUNG_EPITHELIUM_DEVELOPMENT','ASC_SMITH_CELL2018',
'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',
'CELLULAR_MORPHOGENESIS_DURING_DIFFERENTIATION',
'AEP_MORRISEY_2FC']
# (0,0) Plot violin plot
N = len(plot_order)
# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (2,N*1.25))
sns.set_style("white")
for iii,title in enumerate(plot_order):
# SPECIFY AXIS
ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
# GET GENES
genesets_include = [title]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in set(QUERY.columns)]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)
# Format data structure for violin plot
violin_data = []
for ind,v in enumerate(SCORE):
violin_data.append({'gene': title, 'Z-normalized Expression': v,
meta:QUERY.index.get_level_values(meta)[ind]})
violin_data = pd.DataFrame(violin_data)
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue=meta,data=violin_data, palette=palette,notch = True,
fliersize = 1, showmeans=False,linewidth = 0.5, hue_order = met_order)
g.set_ylabel(" ",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax1.legend_.remove()
vs = violin_data.ix[[ind for ind,val in enumerate(violin_data[meta]) if val in met_order]]['Z-normalized Expression']
ax1.set(ylim=(vs.min()*0.95, vs.max()*1.05))
plt.title("{}".format(title),fontsize=6,fontname = 'Arial',loc = 'left',
rotation = 'horizontal', ha = 'left', pad = 1.5)
plt.xlabel('')
ax1.spines['top'].set_linewidth(0.5)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_linewidth(0.5)
ax1.spines['bottom'].set_linewidth(0.5)
ax1.spines['left'].set_linewidth(0.5)
ax1.spines['right'].set_visible(False)
ax1.tick_params(axis='x', bottom=False, labelbottom=False)
print(met_order)
# SAVE FIGURE
figure_label = '_{}_{}_stackedViolin_markerGenes_by{}_LungEp_mean'.format(subset_type,datatype,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# ROBUSTNESS OF RANKING TO INDIVIDUAL GENES:
# STACKED RANKED BOXPLOTS
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(QUERY.columns)]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
print(len(detected_genes))
# LEAVE ONE OUT ANALYSIS: RANK METASTATIC CLUSTERS BY GO_LUNG_EPITHELIUM_DEVELOPMENT_SIGNATURE less one gene
# TO EVALUATE ROBUSTNESS OF CLUSTERING TO INDIVIDUAL GENE LEVELS
array_met_order = np.zeros((len(detected_genes),len(met_order)))
for ind2 in np.arange(len(detected_genes)):
current_genes = deepcopy(detected_genes)
dropped_value = current_genes.pop(ind2)
vals = QUERY[current_genes]
SCORE = np.nanmean(QUERY[current_genes],axis=1)
violin_data = []
for ind,v in enumerate(SCORE):
violin_data.append({'gene': title, 'Score': v,
meta:QUERY.index.get_level_values(meta)[ind]})
violin_data = pd.DataFrame(violin_data)
order = list(violin_data[violin_data['gene']==title].groupby(meta).median().sort_values('Score',ascending = False).index)
met_order = [int(val) for val in order if val in cluster_ind][::-1]
array_met_order[ind2,:] = met_order
leave_one_out_gene_ranking = pd.DataFrame(data = array_met_order,index = detected_genes)
fig = plt.figure(figsize=(2,5))
ax1 = fig.add_axes([0,0,1,1])
for y_ind in np.arange(leave_one_out_gene_ranking.shape[0]):
row_vals = leave_one_out_gene_ranking.ix[y_ind].values
row_colors = pd.Series(row_vals).map(palette)
print(leave_one_out_gene_ranking.ix[y_ind].name)
x = 0
y = y_ind/(leave_one_out_gene_ranking.shape[0])
for c in row_colors:
pos = (x/len(row_vals),y)
ax1.add_patch(patches.Rectangle(pos,1,1 / (len(row_vals)), color=c,
label=leave_one_out_gene_ranking .ix[y_ind].name))
if x >= len(row_vals)-1:
y += 1
x = 0
else:
x += 1
# SAVE FIGURE
figure_label = '_{}_leave_one_out_rankedmean_wAGR2alone'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# LOAD GENESETS
filename = 'stem_cell_signatures_merged'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])
# HEATMAP OF INDIVIDUAL CELLS RANKED BY LUNGE EP
rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(set(QUERY.columns))]
vals = QUERY[genes]
QUERY['LUNG_EPITHELIUM_RANK'] =np.nanmean(vals,axis=1)
QUERY['PROLIFERATION'] = np.nanmean(QUERY[['MCM2','PCNA','MKI67']],axis=1)
plot_genes = ['SOX17','HHEX','KRT5','VIM','SNAI2','BMP4','DKK1','SOX2','FGFR2','FOXA2','NKX2-1','WNT7B','SOX9','TM4SF1']#
mat = pd.DataFrame( data = zscore(QUERY.sort_values(by=['LUNG_EPITHELIUM_RANK'])[plot_genes],axis=0),
columns = plot_genes, index = QUERY.index)
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
for ind in np.arange(half_window):
mat.iloc[ind] = mat.iloc[half_window]
for ind in np.arange(half_window):
mat.iloc[-ind] = mat.iloc[-half_window]
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(4,10))
# ADD MATRIX WITH LINEAGE NAMES
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)]
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
axmatrix.spines['top'].set_visible(False)
axmatrix.spines['right'].set_visible(False)
axmatrix.spines['bottom'].set_visible(False)
axmatrix.spines['left'].set_visible(False)
# ROW1 COLORS
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
ind1 = mat.index.get_level_values(meta)
lut = {'I-P': '#F0F0F0', 'I-Q': '#E0E0E0', 'II': '#A0A0A0', 'III': '#202020'}
row_colors1 = pd.Series(ind1).map(lut)
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.08,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors1:
pos = (x, y / len(row_colors1))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors1), color=c))
if y >= len(row_colors1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD DENDROGRAM
ax2 = fig.add_axes([0.00,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# SAVE FIGURE
figure_label = '{}_RankedCells_Development_All_tumor_Cells_labeled'.format(subset_type).replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
# LOAD GENESETS
filename = 'stem_cell_signatures_merged'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
# SUBSET ALL TUMOR CELLS
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])
# HEATMAP OF INDIVIDUAL CELLS RANKED BY LUNGE EP
rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(set(QUERY.columns))]
vals = QUERY[genes]
QUERY['LUNG_EPITHELIUM_RANK'] = np.nanmean(QUERY[genes],axis=1)
QUERY['PROLIFERATION'] = np.nanmean(QUERY[['MCM2','PCNA','MKI67']],axis=1)
QUERY['NK_ACT'] = np.nanmean(QUERY[['ULBP1','ULBP2','RAET1E','RAET1G','RAET1L','MICA','MICB']],axis=1)
QUERY['NK_INH'] = np.nanmean(QUERY[['HLA-A','HLA-B','HLA-C','B2M']],axis=1)
mat = zscore(QUERY.sort_values(by=['LUNG_EPITHELIUM_RANK'])[genes],axis=0)
plot_genes = ['SOX2','SOX9','PROLIFERATION','NK_INH','NK_ACT',
'HLA-A','HLA-B','HLA-C','B2M',
'ULBP1','ULBP2','RAET1E','RAET1G','RAET1L','MICA','MICB']
mat = pd.DataFrame( data = zscore(QUERY.sort_values(by=['LUNG_EPITHELIUM_RANK'])[plot_genes],axis=0),
columns = plot_genes, index = QUERY.index)
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
for ind in np.arange(half_window):
mat.iloc[ind] = mat.iloc[half_window]
for ind in np.arange(half_window):
mat.iloc[-ind] = mat.iloc[-half_window]
# HERE WE ARE ONLY VISUALIZING CELLS ASSIGNED TO DEVELOPMENTAL STATES I-Q, II, II FOR SIMPLER VISUALIZATION
# PROLIFERATING TYPE I CELLS (STEM LIKE), ASSOCIATE WITH REGENERATIVE TYPE II CELLS
# MOUSE LCC CELLS ASSOCIATE WITH I-Q STATE
subset_key = 'DEVELOPMENT_PHENOGRAPH_CLASS'
ix = [ind for ind, name in enumerate(mat.index.names) if name==subset_key][0]
mat = mat.select(lambda x: x[ix] in ['I-Q','II','III'])
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(6,10))
# ADD MATRIX WITH LINEAGE NAMES
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)]
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
axmatrix.spines['top'].set_visible(False)
axmatrix.spines['right'].set_visible(False)
axmatrix.spines['bottom'].set_visible(False)
axmatrix.spines['left'].set_visible(False)
# ROW1 COLORS
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
ind1 = mat.index.get_level_values(meta)
lut = {'I-P': '#F0F0F0', 'I-Q': '#E0E0E0', 'II': '#A0A0A0', 'III': '#202020'}
row_colors1 = pd.Series(ind1).map(lut)
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.08,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors1:
pos = (x, y / len(row_colors1))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors1), color=c))
if y >= len(row_colors1)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.00,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# SAVE FIGURE
figure_label = '{}_RankedCells_NKligands_All_tumor_Cells_labeled'.format(subset_type).replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
We profiled the transcriptomes of 14,821 single cells obtained from metastases that evolved from H2087-LCC cells xenograft in athymic mice with and without the selective pressure of NK cell surveillance. Observed metastatic states were mapped to the tumor lung epithelial development states observed in human patient metastasis (section 3) and their distributions were compared in the spontaneous vs. NK cell depleted macrometastases.
h5_data_mouse = H5(DATA_PATH+'MOUSE_LUNG_ADENOCARCINOMA_METASTASIS_ANNOTATED.h5')
h5_data_mouse.ls()
# --- DEFINE ALL COLORMAPS UPFRONT FOR UNIFORMITY ---
# CATEGORICAL: SAMPLE
FLATUI_SAMPLE_M = [
'994C00', #BLIneg
'663300', #BLIneg
'9ACD32', #Early Lung
'6B8E23', #Early Lung
'006400', #Early Lung
'FF9933', # Late Micro Lung Met
'FF9933', # Late Micro Lung Met(b)
]
FLATUI_SOURCE_M = [
'006400', # Aggressive
'8B4513', # BLINEG
'FF9933', # Stable Micromet
]
FLATUI_CLASS_M = [
'0000FF',
'FF0000',
'FF8C00',
'FFD700',
'808000',
'9ACD32',
'006400',
'20B2AA',
'2F4F4F',
'00BFFF',
'191970',
'4B0082',
'8B008B',
'FF00FF',
'FF1493',
'8B4513',
'B0C4DE',
'696969',
'FFB6C1',
'FAEBD7',
'F5DEB3',
'00CCCC',
'BC8F8F',
]
# CONVERT HEX TO RGB (SAMPLES)
colors = np.zeros((len(FLATUI_SAMPLE_M),3))
for ind,hexcolor in enumerate(FLATUI_SAMPLE_M):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SAMPLES_M = LinearSegmentedColormap.from_list('cm_sample', colors, N=len(colors))
# CONVERT HEX TO RGB (SOURCE)
colors = np.zeros((len(FLATUI_SOURCE_M),3))
for ind,hexcolor in enumerate(FLATUI_SOURCE_M):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SOURCE_M = LinearSegmentedColormap.from_list('cm_source', colors, N=len(colors))
# CONVERT HEX TO RGB (CLASS)
colors = np.zeros((len(FLATUI_CLASS_M),3))
for ind,hexcolor in enumerate(FLATUI_CLASS_M):
colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS_M = LinearSegmentedColormap.from_list('cm_class', colors, N=len(colors))
We profiled the transcriptomes of 8,748 single cells obtained from 6 spontaneous metastases isolated from 3 bioluminescence (BLI)-negative organs and 3 BLI-positive macrometastases after intracardiac injection of H2087-LCC latent lung adenocaromina cells in athymic mice. One of the micrometastases isolated from a in vivo BLI-negative organ, showed signal upon ex vivo imaging and therefore was annotated as an incipient lesion (Fig. 4a). These samples were isolated by enzymatic and mechanical digestion, and selected for one passage on a plate in antibiotic to which tumor cell were engineered to be resistant. In vitro selection was required to enrich for the exceedingly rare DTCs from BLI negative organs. By immunofluorescence, we observe these persist as single cells or micrometastatic clusters < 10 cells. They show little to weak organ tropsim, therefore, one passage of selection and screening of multiple organ sites is required to find this rare, but important subpopulation of metastatic cells. Given the selection in antibiotic, only tumor cells were sequenced in this experiment. scRNA-seq data from these selected micro- to macro-metastases were merged to create a global atlas of all stages of lung adenocarcinoma progression observed in this xenograft model of metastasis. In this section, we walk through mapping of these mouse metastatic states to the tumor lung epithelial development states observed in human patient metastasis (section 3). "
subset_type = 'XENOGRAFT_METASTASES_NK'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data_mouse.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data_mouse.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data_mouse.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data_mouse.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data_mouse.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
# SPECIFY DIMENSION TYPE
dimtype = 'ForceDirected'
cm = CM_DIVERGING
# PLOT SAMPLE_ID
plt.figure(figsize = (10,10))
gs1 = gridspec.GridSpec(2, 2)
gs1.update(wspace=0.2, hspace=0.2) # set the spacing between axes.
dot_size = 3
exec('QUERY = INDF_{}'.format(subset_type))
exec('X = DIMENSIONS_{}[\'{}0\']'.format(subset_type,dimtype))
exec('Y = DIMENSIONS_{}[\'{}1\']'.format(subset_type,dimtype))
xmax = np.abs(X).max()
ymax = np.abs(Y).max()
axis_min = -ceil(ceil(max(xmax,ymax))/10)*10
axis_max = ceil(ceil(max(xmax,ymax))/10)*10
# (01) PLOT PHNENGORAPH CLUSTERS
ax = plt.subplot(gs1[0])
meta = 'PHENOGRAPH_CLASS'
seqc.plot.scatter.categorical(X, Y, c= QUERY.index.get_level_values(meta), randomize = True,
cmap=CM_CLASS_M,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
plt.title('Type', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax.legend_.remove()
plt.axis('off')
# (02) PLOT SAMPLE_ID
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(X, Y, c=QUERY.index.get_level_values('Legend'), randomize = True,
cmap=CM_SAMPLES_M, s=dot_size, ax=ax)
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
plt.legend('')
# (03) PLOT SOX2 EXPRESSION
ax = plt.subplot(gs1[2])
phase = zscore(QUERY['SOX2'])
# Plot scatter map with phase score
mask = np.isnan(phase)
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6) # finite data
plt.title('SOX2', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.legend('')
# (03) PLOT SOX9 EXPRESSION
ax = plt.subplot(gs1[3])
phase = zscore(QUERY['SOX9'])
# Plot scatter map with phase score
mask = np.isnan(phase)
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6) # finite data
plt.title('SOX9', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.legend('')
# SAVE FIGURE
figure_label = '_SOX2_SOX9_CLUSTER_{}_{}'.format(dimtype,subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
subset_type = 'XENOGRAFT_METASTASES_NK'
exec('QUERY = NDF_{}'.format(subset_type))
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean())
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
mouse_goi = list(retain_bool[retain_bool].index)
print(len(mouse_goi))
# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
exec('QUERY = NDF_{}'.format(subset_type))
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
human_goi = list(retain_bool[retain_bool].index)
print(len(human_goi))
# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
# SELECT INTERSECTING, VARIABLE GENES
expressed = list(set(mouse_goi).intersection(human_goi))
print(len(expressed))
# HUMAN DATA
# SPECIFY SUBSET
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))
subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])
# Load Data (genes x cells)
patient_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns,
columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta
# Median (replace by Geometric Mean)
patient_lung_ep = patient_lung_ep.groupby(level=0, axis=1).median()
# Transpose the data (clusters x genes)
patient_lung_ep = patient_lung_ep.T
# center features across all clusters per gene
patient_lung_ep -= patient_lung_ep.mean(axis=0)
print(patient_lung_ep.shape)
patient_lung_ep.head()
# MOUSE DATA
# SPECIFY SUBSET
subset_type = 'XENOGRAFT_METASTASES_NK'
meta = 'PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))
# Load Data (genes x cells)
mouse_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns,
columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta
# Assign aggregate value
mouse_lung_ep = mouse_lung_ep.groupby(level=0, axis=1).median()
# Transpose the data (clusters x genes)
mouse_lung_ep = mouse_lung_ep.T
print(mouse_lung_ep.shape)
# ROW COLOR LUT
# CONVERT HEX TO RGB (FLATUI_CLASS)
FLATUI_PLOT = FLATUI_CLASS_M
ind = list(mouse_lung_ep.index)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(sorted(np.unique(ind)), colors[cix,:]))
# center features across all clusters per gene
mouse_lung_ep -= mouse_lung_ep.mean(axis=0)
print(mouse_lung_ep.shape)
mouse_lung_ep.head()
rvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
pvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
stderrs = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
for mouse_ind in mouse_lung_ep.index:
for human_ind,human_ind_name in enumerate(patient_lung_ep.index):
m = mouse_lung_ep.ix[mouse_ind]
h = patient_lung_ep.ix[human_ind]
# Linear Regression
lr = linregress(m, h)
slope = lr[0]
intercept = lr[1]
rvalue = lr[2]
pvalue = lr[3]
stderr = lr[4]
rvalues[mouse_ind,human_ind] = rvalue
pvalues[mouse_ind,human_ind] = pvalue
stderrs[mouse_ind,human_ind] = stderr
# Generate Dataframe
rvalues = pd.DataFrame(
data=rvalues,
columns=list(patient_lung_ep.index),
index=mouse_lung_ep.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'
pvalues = pd.DataFrame(
data=pvalues,
columns=list(patient_lung_ep.index),
index=mouse_lung_ep.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'
stderrs = pd.DataFrame(
data=stderrs,
columns=list(patient_lung_ep.index),
index=mouse_lung_ep.index)
stderrs.columns.name = 'Patient Single Cell Lung Epithelium'
# CLUSTERED HEATMAP OF CORRELATIONS
corrs = rvalues
pvals = pvalues
stds = stderrs
# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average' # single
metric = 'euclidean' # correlation
linkage = hc.linkage(corrs, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(corrs.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = ['I-Q','I-P','II','III']#hc.leaves_list(col_linkage)
mat = corrs.ix[r1,cl]
pvals = pvals.ix[r1,cl]
# REMOVE CLUSTERS NOT SIGNIFICANTLY CORRELATED
keep_rows = np.abs(mat).sum(axis=1)>0
mat = mat.ix[list(keep_rows[keep_rows==True].index)]
pvals = pvals.ix[list(keep_rows[keep_rows==True].index)]
# ROW COLORS
ind = list(mat.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(0.9,4))
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.12,0.1,0.20,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors:
pos = (x, y / len(row_colors))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors), color=c))
if y >= len(row_colors)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.2,0.1,0.7,0.6])
im = axmatrix.matshow(mat[(pvals<0.05)], aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
# ADD COLORBAR
axcolor = fig.add_axes([1.2,0.1,0.05,0.6])
cbar = plt.colorbar(im, cax=axcolor)
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.038,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# SAVE FIGURE
figure_label = '_PatientLungEp_NDF_{}_CORRELATIONS_AVERAGE_{}_labeled'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
import networkx as nx
import matplotlib.pyplot as plt
sns.set_style("white")
B = nx.Graph()
B.add_nodes_from(corrs.index, bipartite=0)
B.add_nodes_from(corrs.columns, bipartite=1)
list(B)
B.add_weighted_edges_from(
[(idx, 'I-Q',row['I-Q']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
[(idx, 'I-P',row['I-P']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
[(idx, 'II',row['II']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
[(idx, 'III',row['III']+1) for idx, row in corrs.iterrows()],weight='weight')
cutoff = 1.20
SUB_B=nx.Graph( [ (u,v,d) for u,v,d in B.edges(data=True) if d['weight']>cutoff] )
pos = nx.spring_layout(SUB_B)
list(SUB_B)
color_map = []
edge_color_map = []
size_array = []
for n in SUB_B:
if n=='I-P':
color_map.append('#F0F0F0')
edge_color_map.append('black')
size_array.append(2000)
elif n=='I-Q':
color_map.append('#E0E0E0')
edge_color_map.append('black')
size_array.append(2000)
elif n=='II':
color_map.append('#A0A0A0')
edge_color_map.append('black')
size_array.append(2000)
elif n=='III':
color_map.append('#202020')
edge_color_map.append('black')
size_array.append(2000)
else:
color_map.append(lut[n])
edge_color_map.append('white')
size_array.append(1000)
plt.figure(figsize = (10,10))
ax = plt.gca()
nodes = nx.draw_networkx_labels(SUB_B, pos,with_labels=True,node_size = size_array,
node_color = color_map,fontsize=3,fontcolor = 'white')
nodes = nx.draw_networkx_nodes(SUB_B, pos,with_labels=True,node_size = size_array,
node_color = color_map)
nodes.set_edgecolor(edge_color_map)
#4 c. Plot the edges - one by one!
all_weights = []
for (node1,node2,data) in SUB_B.edges(data=True):
all_weights.append(data['weight']) #we'll use this when determining edge thickness
#4 b. Get unique weights
unique_weights = list(set(all_weights))
for weight in unique_weights:
weighted_edges = [(node1,node2) for (node1,node2,edge_attr) in SUB_B.edges(data=True) if edge_attr['weight']==weight]
width = weight*len(SUB_B)/sum(all_weights)*3
nx.draw_networkx_edges(SUB_B,pos,edgelist=weighted_edges,width=width)
# SAVE FIGURE
FIG_output_stem = './figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_'+now+'/'
figure_label = 'BipartiteGraph-Mouse-Human-Correlation-Labeled'
fn = FIG_output_stem + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
Cells were assigned to quiescent (I-Q), regenerating (I-P/II), and SOX9 high escape (III) as visualized in the bipartite graph above. Clusters that did not signficiantly associate with any of these developmental states (not shown in bipartite) were annotated as 'NA'. All annotations are saved in the multi-index of each dataframe under index name: DEVELOPMENTAL_STATE.
subset_type = 'XENOGRAFT_METASTASES_NK'
meta = 'Meta-Source'
exec('fractions = DF_{}.groupby(level=[\'PHENOGRAPH_CLASS\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(subset_type,meta))
tissue_cluster_sizes = fractions.iloc[:,:3]
colors = [
'#006400', # AGGRESSIVE
'#8B4513', # BLINEG
'#FF9933', # STABLE MICROMET
]
# Plot parameters
nrow = int(ceil(sqrt(len(tissue_cluster_sizes.index))))
ncol = int(ceil(sqrt(len(tissue_cluster_sizes.index))))
plt.figure(figsize = (10,10))
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.25, hspace=0.1) # set the spacing between axes.
sns.set(font_scale=2.5)
for ind in np.arange(tissue_cluster_sizes.shape[0]):
ax = plt.subplot(gs1[ind])
plt.pie(
tissue_cluster_sizes.iloc[ind,:],
shadow=False,
colors=colors,
)
# View the plot drop above
plt.axis('equal')
plt.xlabel(tissue_cluster_sizes.index[ind], fontsize = 10)
# SAVE FIGURE
FIG_output_stem = './figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_'+now+'/'
figure_label = 'BipartiteGraph-Mouse-Human-Correlation-PiePlots-Filled'
fn = FIG_output_stem + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
# PLOT SAMPLE_ID
subset_type = 'XENOGRAFT_METASTASES_NK'
plt.figure(figsize = (12,5))
gs1 = gridspec.GridSpec(1,4)
gs1.update(wspace=0.35, hspace=0.2) # set the spacing between axes.
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
sns.set_style("white")
subset_key = 'PHENOGRAPH_CLASS'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
# PHENOGRAPH CLUSTERS THAT SIGNFICANTLY MAPPED TO HUMAN LUNG EP DEVELOPMENT STATE
QUERY = QUERY.select(lambda x: x[ix] in [3,4,10,0,1,5,8,9,16,6,15,12,2,17,13,11,7])
# ANNOATIONS FROM BI-PARTITE GRAPH
development_class_lut = {3:'Escape',
4:'Escape',
10:'Escape',
0:'Escape',
1:'Regenerating',
5:'Regenerating',
8:'Regenerating',
9:'Regenerating',
16: 'Quiescent',
6: 'Quiescent',
15: 'Quiescent',
12: 'Quiescent',
2: 'Quiescent',
17: 'Quiescent',
13: 'Quiescent',
11: 'Quiescent',
7: 'Quiescent' }
ind = QUERY.index.get_level_values(subset_key).map(development_class_lut)
palette = {'Quiescent': '#E0E0E0', 'Regenerating': '#A0A0A0', 'Escape': '#202020'}
order = ['Quiescent','Regenerating','Escape']
##############################################################################################################
ax = plt.subplot(gs1[0])
title= 'SOX2'
genes = [title]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)
# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
violin_data.append({'gene': title, 'Z-normalized Expression': v,
'Development_Class':ind[ii]})
violin_data = pd.DataFrame(violin_data)
# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True,
hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels,
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)
# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values
print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))
print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))
print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################
ax = plt.subplot(gs1[1])
title= 'SOX9'
genes = [title]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)
# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
violin_data.append({'gene': title, 'Z-normalized Expression': v,
'Development_Class':ind[ii]})
violin_data = pd.DataFrame(violin_data)
# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True,
hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels,
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)
# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values
print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))
print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))
print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################
ax = plt.subplot(gs1[2])
title = 'NK_ACTIVATING'
genes = ['ULBP1','ULBP2','RAET1E','RAET1G','RAET1L','MICA','MICB']
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)
# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
violin_data.append({'gene': title, 'Z-normalized Expression': v,
'Development_Class':ind[ii]})
violin_data = pd.DataFrame(violin_data)
# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True,
hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels,
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)
# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values
print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))
print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))
print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################
ax = plt.subplot(gs1[3])
title = 'NK_INHIBITORY'
genes = ['HLA-A','HLA-B','HLA-C','B2M']
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)
# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
violin_data.append({'gene': title, 'Z-normalized Expression': v,
'Development_Class':ind[ii]})
violin_data = pd.DataFrame(violin_data)
# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True,
hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels,
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)
# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values
print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))
print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))
print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################
# SAVE FIGURE
figure_label = '_boxplots_NKligands_{}_developmentclass'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
We profiled the transcriptomes of 6,073 single cells obtained from 8 H2087-LCC macro-metastases, whose outgrowth was triggered by NK cell depletion (IP administration of anti-GM1). Tumor cells were freshly isolated from enzymatically and mechanically dissocaited macrometastases by flow sorting of GFP+ cells. Macrometastatic growths were identified and tracked by longitudinal bioluminescence imaging. scRNA-seq data from these freshly isoalted macrometastatic outgrowths were merged to create a global atlas of cell states observed in macrometastases that evolved without the selective pressure of NK cell surveillance. In this section, we walk through mapping of these NK cell-depleted mouse metastatic states to the tumor lung epithelial development states observed in human patient metastasis (section 3).
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data_mouse.load(\'/DF_{}\')'.format(subset_type,subset_type))
# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data_mouse.load(\'/NDF_{}\')'.format(subset_type,subset_type))
# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data_mouse.load(\'/INDF_{}\')'.format(subset_type,subset_type))
# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data_mouse.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))
# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data_mouse.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))
# SPECIFY DIMENSION TYPE
dimtype = 'ForceDirected'
cm = CM_DIVERGING
# PLOT SAMPLE_ID
plt.figure(figsize = (10,10))
gs1 = gridspec.GridSpec(2, 2)
gs1.update(wspace=0.2, hspace=0.2) # set the spacing between axes.
dot_size = 3
exec('QUERY = INDF_{}'.format(subset_type))
exec('X = DIMENSIONS_{}[\'{}0\']'.format(subset_type,dimtype))
exec('Y = DIMENSIONS_{}[\'{}1\']'.format(subset_type,dimtype))
xmax = np.abs(X).max()
ymax = np.abs(Y).max()
axis_min = -ceil(ceil(max(xmax,ymax))/10)*10
axis_max = ceil(ceil(max(xmax,ymax))/10)*10
# (01) PLOT PHNENGORAPH CLUSTERS
ax = plt.subplot(gs1[0])
meta = 'PHENOGRAPH_CLASS'
seqc.plot.scatter.categorical(X, Y, c= QUERY.index.get_level_values(meta), randomize = True,
cmap=CM_CLASS_M,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
plt.title('Type', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax.legend_.remove()
plt.axis('off')
# (02) PLOT SAMPLE_ID
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(X, Y, c=QUERY.index.get_level_values('Legend'), randomize = True,
cmap=CM_CLASS_M, s=dot_size, ax=ax)
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
plt.legend('')
# (03) PLOT SOX2 EXPRESSION
ax = plt.subplot(gs1[2])
phase = zscore(QUERY['SOX2'])
# Plot scatter map with phase score
mask = np.isnan(phase)
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6) # finite data
plt.title('SOX2', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.legend('')
# (03) PLOT SOX9 EXPRESSION
ax = plt.subplot(gs1[3])
phase = zscore(QUERY['SOX9'])
# Plot scatter map with phase score
mask = np.isnan(phase)
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6) # finite data
plt.title('SOX9', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.legend('')
# SAVE FIGURE
figure_label = '_SOX2_SOX9_CLUSTER_{}_{}'.format(dimtype,subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
exec('QUERY = NDF_{}'.format(subset_type))
matplotlib.rc('xtick', labelsize=12)
matplotlib.rc('ytick', labelsize=12)
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency', fontsize=12)
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)', fontsize=12)
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean())
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)', fontsize=12)
plt.ylabel('variance log10(expression)', fontsize=12)
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
mouse_goi = list(retain_bool[retain_bool].index)
print(len(mouse_goi))
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
exec('QUERY = NDF_{}'.format(subset_type))
# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes.
# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS
ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency', fontsize=12)
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)', fontsize=12)
plt.grid(True)
sns.despine()
# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)
x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)
# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())
ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)', fontsize=12)
plt.ylabel('variance log10(expression)', fontsize=12)
# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
human_goi = list(retain_bool[retain_bool].index)
print(len(human_goi))
# SELECT INTERSECTING, VARIABLE GENES
expressed = list(set(mouse_goi).intersection(human_goi))
print(len(expressed))
# HUMAN DATA
# SPECIFY SUBSET
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))
subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])
# Load Data (genes x cells)
patient_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns,
columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta
# Median (replace by Geometric Mean)
patient_lung_ep = patient_lung_ep.groupby(level=0, axis=1).median()
# Transpose the data (clusters x genes)
patient_lung_ep = patient_lung_ep.T
# center features across all clusters per gene
patient_lung_ep -= patient_lung_ep.mean(axis=0)
print(patient_lung_ep.shape)
patient_lung_ep.head()
# MOUSE DATA
# SPECIFY SUBSET
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
meta = 'PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))
# Load Data (genes x cells)
mouse_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns,
columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta
# Assign aggregate value
mouse_lung_ep = mouse_lung_ep.groupby(level=0, axis=1).median()
# Transpose the data (clusters x genes)
mouse_lung_ep = mouse_lung_ep.T
print(mouse_lung_ep.shape)
# ROW COLOR LUT
# CONVERT HEX TO RGB (FLATUI_CLASS)
FLATUI_PLOT = FLATUI_CLASS_M
ind = list(mouse_lung_ep.index)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(sorted(np.unique(ind)), colors[cix,:]))
# center features across all clusters per gene
mouse_lung_ep -= mouse_lung_ep.mean(axis=0)
print(mouse_lung_ep.shape)
mouse_lung_ep.head()
rvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
pvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
stderrs = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
for mouse_ind in mouse_lung_ep.index:
for human_ind,human_ind_name in enumerate(patient_lung_ep.index):
m = mouse_lung_ep.ix[mouse_ind]
h = patient_lung_ep.ix[human_ind]
# Linear Regression
lr = linregress(m, h)
slope = lr[0]
intercept = lr[1]
rvalue = lr[2]
pvalue = lr[3]
stderr = lr[4]
rvalues[mouse_ind,human_ind] = rvalue
pvalues[mouse_ind,human_ind] = pvalue
stderrs[mouse_ind,human_ind] = stderr
# Generate Dataframe
rvalues = pd.DataFrame(
data=rvalues,
columns=list(patient_lung_ep.index),
index=mouse_lung_ep.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'
pvalues = pd.DataFrame(
data=pvalues,
columns=list(patient_lung_ep.index),
index=mouse_lung_ep.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'
stderrs = pd.DataFrame(
data=stderrs,
columns=list(patient_lung_ep.index),
index=mouse_lung_ep.index)
stderrs.columns.name = 'Patient Single Cell Lung Epithelium'
# CLUSTERED HEATMAP OF CORRELATIONS
corrs = rvalues
pvals = pvalues
stds = stderrs
# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average' # single
metric = 'euclidean' # correlation
linkage = hc.linkage(corrs, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(corrs.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = ['I-Q','I-P','II','III']#hc.leaves_list(col_linkage)
mat = corrs.ix[r1,cl]
pvals = pvals.ix[r1,cl]
# REMOVE CLUSTERS NOT SIGNIFICANTLY CORRELATED
keep_rows = np.abs(mat).sum(axis=1)>0
mat = mat.ix[list(keep_rows[keep_rows==True].index)]
pvals = pvals.ix[list(keep_rows[keep_rows==True].index)]
# ROW COLORS
ind = list(mat.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(0.9,4))
# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.12,0.1,0.20,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors:
pos = (x, y / len(row_colors))
ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors), color=c))
if y >= len(row_colors)-1:
x += 1
y = 0
else:
y += 1
plt.axis('off')
# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.2,0.1,0.7,0.6])
im = axmatrix.matshow(mat[(pvals<0.05)], aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
# ADD COLORBAR
axcolor = fig.add_axes([1.2,0.1,0.05,0.6])
cbar = plt.colorbar(im, cax=axcolor)
# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.038,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')
# SAVE FIGURE
figure_label = '_PatientLungEp_NDF_{}_CORRELATIONS_AVERAGE_{}_labeled'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
import networkx as nx
import matplotlib.pyplot as plt
sns.set_style("white")
B = nx.Graph()
B.add_nodes_from(corrs.index, bipartite=0)
B.add_nodes_from(corrs.columns, bipartite=1)
list(B)
B.add_weighted_edges_from(
[(idx, 'I-Q',row['I-Q']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
[(idx, 'I-P',row['I-P']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
[(idx, 'II',row['II']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
[(idx, 'III',row['III']+1) for idx, row in corrs.iterrows()],weight='weight')
cutoff = 1.20
SUB_B=nx.Graph( [ (u,v,d) for u,v,d in B.edges(data=True) if d['weight']>cutoff] )
pos = nx.spring_layout(SUB_B)
list(SUB_B)
color_map = []
edge_color_map = []
size_array = []
for n in SUB_B:
if n=='I-P':
color_map.append('#F0F0F0')
edge_color_map.append('black')
size_array.append(2000)
elif n=='I-Q':
color_map.append('#E0E0E0')
edge_color_map.append('black')
size_array.append(2000)
elif n=='II':
color_map.append('#A0A0A0')
edge_color_map.append('black')
size_array.append(2000)
elif n=='III':
color_map.append('#202020')
edge_color_map.append('black')
size_array.append(2000)
else:
color_map.append(lut[n])
edge_color_map.append('white')
size_array.append(1000)
plt.figure(figsize = (10,10))
ax = plt.gca()
nodes = nx.draw_networkx_labels(SUB_B, pos,with_labels=True,node_size = size_array,
node_color = color_map,fontsize=3,fontcolor = 'white')
nodes = nx.draw_networkx_nodes(SUB_B, pos,with_labels=True,node_size = size_array,
node_color = color_map)
nodes.set_edgecolor(edge_color_map)
#4 c. Plot the edges - one by one!
all_weights = []
for (node1,node2,data) in SUB_B.edges(data=True):
all_weights.append(data['weight']) #we'll use this when determining edge thickness
#4 b. Get unique weights
unique_weights = list(set(all_weights))
for weight in unique_weights:
weighted_edges = [(node1,node2) for (node1,node2,edge_attr) in SUB_B.edges(data=True) if edge_attr['weight']==weight]
width = weight*len(SUB_B)/sum(all_weights)*3
nx.draw_networkx_edges(SUB_B,pos,edgelist=weighted_edges,width=width)
# SAVE FIGURE
FIG_output_stem = './figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_'+now+'/'
figure_label = 'BipartiteGraph-NKdepletedMouse-Human-Correlation-Labeled'
fn = FIG_output_stem + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
Cells were assigned to quiescent (I-Q), regenerating (I-P/II), and SOX9 high escape (III) as visualized in the bipartite graph above. Clusters that did not signficiantly associate with any of these developmental states (not shown in bipartite) were annotated as 'NA'. All annotations are saved in the multi-index of each dataframe under index name: DEVELOPMENTAL_STATE.
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
meta = 'DEVELOPMENTAL_STATE'
exec('QUERY = INDF_{}'.format(subset_type))
tissue_cluster_sizes = QUERY.groupby(level=['Legend', '{}'.format(meta)], axis=0).size().unstack().fillna(0)
NK_DEPLETED = tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0)
NK_DEPLETED['I/II'] = NK_DEPLETED['I-P/II']+NK_DEPLETED['I-Q']
NK_DEPLETED['TYPE'] = ['NK_DEPLETED']*NK_DEPLETED.shape[0]
NK_DEPLETED
subset_type = 'XENOGRAFT_METASTASES_NK'
meta = 'DEVELOPMENTAL_STATE'
exec('QUERY = INDF_{}'.format(subset_type))
tissue_cluster_sizes = QUERY.groupby(level=['Legend', '{}'.format(meta)], axis=0).size().unstack().fillna(0)
SPONTANEOUS = tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0)
SPONTANEOUS['I/II'] = SPONTANEOUS['I-P/II']+SPONTANEOUS['I-Q']
# SELECT AGGRESSIVE MACROMETASTSESE FOR COMPARISON
SPONTANEOUS = SPONTANEOUS.ix[['EarlySpont_Lung_M4766','EarlySpont_Lung_M4768','EarlySpont_Lung_M4770']]
SPONTANEOUS['TYPE'] = ['Spontaneous']*SPONTANEOUS.shape[0]
SPONTANEOUS
# SOURCE, CELL TYPE, FRACTION
outbreaks = SPONTANEOUS.append(NK_DEPLETED)
plot_vals = ['I-Q','I-P/II','I/II','III','NA']
boxplot_data = pd.DataFrame()
for val in plot_vals:
current_df = pd.DataFrame()
current_df['FRACTION'] = outbreaks[val]
current_df['TYPE'] = outbreaks['TYPE'].values
current_df['STATE'] = [val]*outbreaks.shape[0]
boxplot_data = boxplot_data.append(current_df)
# GROUPED BOXPLOT WITH DATA POINTS OVERLAYED
fig = plt.figure(figsize = (3,5))
ax1 = plt.gca()
sns.set_style("white")
labels = ['NA','I/II','III']
palette = {'NK_DEPLETED':'white',
'Spontaneous':'black'}
ax = sns.boxplot(x="STATE", y="FRACTION", hue="TYPE",data=boxplot_data, #palette=palette,
hue_order = ['Spontaneous','NK_DEPLETED'],
order = labels, fliersize = 4, showmeans=False,linewidth = 1, notch = False)
# iterate over boxes
for i,box in enumerate(ax.artists):
box.set_edgecolor('black')
box.set_facecolor('white')
# iterate over whiskers and median lines
for j in range(6*i,6*(i+1)):
ax.lines[j].set_color('black')
g = sns.swarmplot(x="STATE", y="FRACTION", hue="TYPE",data=boxplot_data,
palette=palette,
size = 5,edgecolor='#000000', linewidth=1,
hue_order = ['Spontaneous','NK_DEPLETED'],order = labels, split=True)
g.set_ylabel("\n\n\nFraction of metastasis",fontsize=14,fontname = 'Arial')
g.set_xticklabels(labels,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=14)
sns.despine()
plt.ylim((0,1))
ax1.legend_.remove()